diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja b/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8121dc73b9785dee6317a843d22bea2f42917744 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json @@ -0,0 +1,1343 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 187, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0053475935828877, + "grad_norm": 5.576223850250244, + "learning_rate": 5e-06, + "loss": 0.9394, + "step": 1 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 41.748443603515625, + "learning_rate": 4.99999647201733e-06, + "loss": 2.0122, + "step": 2 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 10.106061935424805, + "learning_rate": 4.999985888079276e-06, + "loss": 1.0092, + "step": 3 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 12.377921104431152, + "learning_rate": 4.999968248215712e-06, + "loss": 1.5196, + "step": 4 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 7.258418560028076, + "learning_rate": 4.999943552476422e-06, + "loss": 1.4586, + "step": 5 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 5.282329559326172, + "learning_rate": 4.999911800931108e-06, + "loss": 1.1068, + "step": 6 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 3.468794822692871, + "learning_rate": 4.999872993669387e-06, + "loss": 0.8997, + "step": 7 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 3.2200160026550293, + "learning_rate": 4.999827130800785e-06, + "loss": 1.075, + "step": 8 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 21.746450424194336, + "learning_rate": 4.999774212454746e-06, + "loss": 1.691, + "step": 9 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 11.011313438415527, + "learning_rate": 4.999714238780626e-06, + "loss": 1.3167, + "step": 10 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 5.002156734466553, + "learning_rate": 4.999647209947694e-06, + "loss": 0.9653, + "step": 11 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 12.305068016052246, + "learning_rate": 4.999573126145132e-06, + "loss": 1.2992, + "step": 12 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 5.660033702850342, + "learning_rate": 4.999491987582032e-06, + "loss": 0.9204, + "step": 13 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 5.366727828979492, + "learning_rate": 4.999403794487399e-06, + "loss": 1.3307, + "step": 14 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 3.265700578689575, + "learning_rate": 4.999308547110147e-06, + "loss": 0.8596, + "step": 15 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 3.0776281356811523, + "learning_rate": 4.9992062457191005e-06, + "loss": 0.9614, + "step": 16 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.8679943084716797, + "learning_rate": 4.999096890602996e-06, + "loss": 0.8, + "step": 17 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.785064935684204, + "learning_rate": 4.998980482070473e-06, + "loss": 0.7683, + "step": 18 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 4.619974613189697, + "learning_rate": 4.998857020450084e-06, + "loss": 1.2742, + "step": 19 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 2.894366979598999, + "learning_rate": 4.998726506090283e-06, + "loss": 0.8559, + "step": 20 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 3.4240734577178955, + "learning_rate": 4.998588939359435e-06, + "loss": 0.8223, + "step": 21 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 4.5151777267456055, + "learning_rate": 4.998444320645803e-06, + "loss": 1.1229, + "step": 22 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 2.7780518531799316, + "learning_rate": 4.998292650357558e-06, + "loss": 0.8936, + "step": 23 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 3.7252511978149414, + "learning_rate": 4.998133928922773e-06, + "loss": 1.2552, + "step": 24 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 4.296158313751221, + "learning_rate": 4.99796815678942e-06, + "loss": 0.7075, + "step": 25 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 5.5546956062316895, + "learning_rate": 4.997795334425372e-06, + "loss": 0.9781, + "step": 26 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 3.685818910598755, + "learning_rate": 4.997615462318403e-06, + "loss": 1.0657, + "step": 27 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 3.5500221252441406, + "learning_rate": 4.997428540976177e-06, + "loss": 0.951, + "step": 28 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 13.312395095825195, + "learning_rate": 4.997234570926263e-06, + "loss": 0.6788, + "step": 29 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.6344847679138184, + "learning_rate": 4.997033552716116e-06, + "loss": 0.8, + "step": 30 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 3.0757298469543457, + "learning_rate": 4.9968254869130885e-06, + "loss": 0.7625, + "step": 31 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 4.064891815185547, + "learning_rate": 4.996610374104422e-06, + "loss": 0.7381, + "step": 32 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 7.529796123504639, + "learning_rate": 4.9963882148972475e-06, + "loss": 1.3283, + "step": 33 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 3.2115354537963867, + "learning_rate": 4.996159009918586e-06, + "loss": 1.0002, + "step": 34 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 4.122320652008057, + "learning_rate": 4.9959227598153395e-06, + "loss": 0.9095, + "step": 35 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 54.98562240600586, + "learning_rate": 4.9956794652542994e-06, + "loss": 1.2191, + "step": 36 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 3.083123207092285, + "learning_rate": 4.9954291269221364e-06, + "loss": 0.7424, + "step": 37 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 15.99591064453125, + "learning_rate": 4.995171745525401e-06, + "loss": 0.9289, + "step": 38 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 5.214310169219971, + "learning_rate": 4.994907321790524e-06, + "loss": 0.991, + "step": 39 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 3.4376749992370605, + "learning_rate": 4.994635856463811e-06, + "loss": 0.6406, + "step": 40 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 4.30764102935791, + "learning_rate": 4.994357350311441e-06, + "loss": 1.2038, + "step": 41 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 3.5810039043426514, + "learning_rate": 4.994071804119467e-06, + "loss": 0.9696, + "step": 42 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 4.080881595611572, + "learning_rate": 4.993779218693811e-06, + "loss": 1.1579, + "step": 43 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 3.1389286518096924, + "learning_rate": 4.99347959486026e-06, + "loss": 0.7118, + "step": 44 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 2.6397321224212646, + "learning_rate": 4.99317293346447e-06, + "loss": 0.7579, + "step": 45 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 2.9469995498657227, + "learning_rate": 4.992859235371958e-06, + "loss": 0.7105, + "step": 46 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 2.669086456298828, + "learning_rate": 4.992538501468101e-06, + "loss": 0.6812, + "step": 47 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 5.421566009521484, + "learning_rate": 4.992210732658132e-06, + "loss": 0.9733, + "step": 48 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 4.413289546966553, + "learning_rate": 4.991875929867143e-06, + "loss": 1.1301, + "step": 49 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 3.1602351665496826, + "learning_rate": 4.991534094040077e-06, + "loss": 0.6706, + "step": 50 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 4.374372959136963, + "learning_rate": 4.991185226141726e-06, + "loss": 0.9462, + "step": 51 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.9649057388305664, + "learning_rate": 4.990829327156729e-06, + "loss": 1.0714, + "step": 52 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 3.1991283893585205, + "learning_rate": 4.990466398089571e-06, + "loss": 0.9175, + "step": 53 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 2.580082654953003, + "learning_rate": 4.99009643996458e-06, + "loss": 0.5164, + "step": 54 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 3.9115707874298096, + "learning_rate": 4.989719453825918e-06, + "loss": 0.7223, + "step": 55 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 2.825481414794922, + "learning_rate": 4.989335440737587e-06, + "loss": 0.7065, + "step": 56 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 2.8599696159362793, + "learning_rate": 4.9889444017834185e-06, + "loss": 0.8833, + "step": 57 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 2.885662078857422, + "learning_rate": 4.988546338067078e-06, + "loss": 0.8664, + "step": 58 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 3.187185764312744, + "learning_rate": 4.988141250712053e-06, + "loss": 0.884, + "step": 59 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 3.7545692920684814, + "learning_rate": 4.987729140861657e-06, + "loss": 0.9891, + "step": 60 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 3.0581002235412598, + "learning_rate": 4.987310009679023e-06, + "loss": 0.8838, + "step": 61 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 2.8039402961730957, + "learning_rate": 4.986883858347101e-06, + "loss": 0.8188, + "step": 62 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 3.01231050491333, + "learning_rate": 4.986450688068655e-06, + "loss": 0.6032, + "step": 63 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 2.7969677448272705, + "learning_rate": 4.986010500066258e-06, + "loss": 0.7623, + "step": 64 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 2.757786989212036, + "learning_rate": 4.985563295582292e-06, + "loss": 0.8051, + "step": 65 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.9582571983337402, + "learning_rate": 4.98510907587894e-06, + "loss": 0.7901, + "step": 66 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 3.104294776916504, + "learning_rate": 4.984647842238185e-06, + "loss": 1.0582, + "step": 67 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 2.7413785457611084, + "learning_rate": 4.984179595961806e-06, + "loss": 0.5912, + "step": 68 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 2.722858190536499, + "learning_rate": 4.983704338371375e-06, + "loss": 0.7855, + "step": 69 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 2.5095815658569336, + "learning_rate": 4.983222070808255e-06, + "loss": 0.6491, + "step": 70 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 2.97511887550354, + "learning_rate": 4.982732794633588e-06, + "loss": 0.9735, + "step": 71 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 3.5139546394348145, + "learning_rate": 4.982236511228301e-06, + "loss": 0.8495, + "step": 72 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 3.086568593978882, + "learning_rate": 4.981733221993099e-06, + "loss": 1.0891, + "step": 73 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 3.490666389465332, + "learning_rate": 4.981222928348456e-06, + "loss": 0.8013, + "step": 74 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 3.3275415897369385, + "learning_rate": 4.98070563173462e-06, + "loss": 0.8298, + "step": 75 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 2.7193403244018555, + "learning_rate": 4.980181333611601e-06, + "loss": 0.6989, + "step": 76 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 2.5338242053985596, + "learning_rate": 4.979650035459171e-06, + "loss": 0.6769, + "step": 77 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 2.7369015216827393, + "learning_rate": 4.9791117387768575e-06, + "loss": 1.0385, + "step": 78 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 2.6109988689422607, + "learning_rate": 4.978566445083942e-06, + "loss": 0.6498, + "step": 79 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 3.0895354747772217, + "learning_rate": 4.978014155919455e-06, + "loss": 0.7931, + "step": 80 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 2.6197807788848877, + "learning_rate": 4.977454872842169e-06, + "loss": 0.7322, + "step": 81 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 2.9248461723327637, + "learning_rate": 4.976888597430597e-06, + "loss": 0.9184, + "step": 82 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 2.7636630535125732, + "learning_rate": 4.976315331282985e-06, + "loss": 0.8258, + "step": 83 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 2.702061653137207, + "learning_rate": 4.9757350760173144e-06, + "loss": 0.7414, + "step": 84 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 2.3016257286071777, + "learning_rate": 4.975147833271288e-06, + "loss": 0.8573, + "step": 85 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 2.758795738220215, + "learning_rate": 4.974553604702332e-06, + "loss": 0.7271, + "step": 86 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 3.0134952068328857, + "learning_rate": 4.973952391987589e-06, + "loss": 0.8976, + "step": 87 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.668630838394165, + "learning_rate": 4.9733441968239125e-06, + "loss": 1.0753, + "step": 88 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 2.5940303802490234, + "learning_rate": 4.972729020927866e-06, + "loss": 0.6903, + "step": 89 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 3.0423827171325684, + "learning_rate": 4.97210686603571e-06, + "loss": 0.9347, + "step": 90 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 2.5026450157165527, + "learning_rate": 4.97147773390341e-06, + "loss": 0.6738, + "step": 91 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 3.596545457839966, + "learning_rate": 4.970841626306617e-06, + "loss": 0.8356, + "step": 92 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 3.2207071781158447, + "learning_rate": 4.970198545040673e-06, + "loss": 0.9117, + "step": 93 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 2.858541965484619, + "learning_rate": 4.969548491920603e-06, + "loss": 0.8237, + "step": 94 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 2.896359920501709, + "learning_rate": 4.968891468781105e-06, + "loss": 0.8775, + "step": 95 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 3.6659083366394043, + "learning_rate": 4.968227477476554e-06, + "loss": 0.9068, + "step": 96 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 3.6469972133636475, + "learning_rate": 4.9675565198809905e-06, + "loss": 1.0435, + "step": 97 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 4.615362167358398, + "learning_rate": 4.966878597888114e-06, + "loss": 1.0084, + "step": 98 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 3.4075334072113037, + "learning_rate": 4.966193713411284e-06, + "loss": 0.7217, + "step": 99 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.965501868383507e-06, + "loss": 0.6594, + "step": 100 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 4.086977958679199, + "learning_rate": 4.964803064757438e-06, + "loss": 0.9249, + "step": 101 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.676903247833252, + "learning_rate": 4.964097304505371e-06, + "loss": 0.7776, + "step": 102 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 2.5098068714141846, + "learning_rate": 4.963384589619233e-06, + "loss": 0.6339, + "step": 103 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 4.064920902252197, + "learning_rate": 4.962664922110581e-06, + "loss": 1.0107, + "step": 104 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 2.6229960918426514, + "learning_rate": 4.9619383040105954e-06, + "loss": 1.0052, + "step": 105 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 2.857506275177002, + "learning_rate": 4.961204737370071e-06, + "loss": 0.8577, + "step": 106 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 3.9176764488220215, + "learning_rate": 4.960464224259418e-06, + "loss": 1.1237, + "step": 107 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 2.9063003063201904, + "learning_rate": 4.95971676676865e-06, + "loss": 0.6237, + "step": 108 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 3.1583969593048096, + "learning_rate": 4.958962367007381e-06, + "loss": 0.9135, + "step": 109 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.7559218406677246, + "learning_rate": 4.958201027104818e-06, + "loss": 0.7461, + "step": 110 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 11.086910247802734, + "learning_rate": 4.957432749209755e-06, + "loss": 0.69, + "step": 111 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 3.8109939098358154, + "learning_rate": 4.95665753549057e-06, + "loss": 0.8578, + "step": 112 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 3.3317348957061768, + "learning_rate": 4.9558753881352165e-06, + "loss": 1.3098, + "step": 113 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 2.715823173522949, + "learning_rate": 4.955086309351213e-06, + "loss": 0.9979, + "step": 114 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 2.798602819442749, + "learning_rate": 4.9542903013656485e-06, + "loss": 0.6298, + "step": 115 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 32.90562438964844, + "learning_rate": 4.953487366425163e-06, + "loss": 0.959, + "step": 116 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 4.012441158294678, + "learning_rate": 4.952677506795949e-06, + "loss": 0.6791, + "step": 117 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 3.548151731491089, + "learning_rate": 4.951860724763743e-06, + "loss": 0.7783, + "step": 118 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 3.4778249263763428, + "learning_rate": 4.95103702263382e-06, + "loss": 0.8085, + "step": 119 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 2.625532627105713, + "learning_rate": 4.950206402730984e-06, + "loss": 0.7702, + "step": 120 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 3.2743935585021973, + "learning_rate": 4.949368867399567e-06, + "loss": 0.602, + "step": 121 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 3.9576094150543213, + "learning_rate": 4.948524419003415e-06, + "loss": 1.2858, + "step": 122 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 3.233257532119751, + "learning_rate": 4.947673059925889e-06, + "loss": 0.7945, + "step": 123 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 2.6730406284332275, + "learning_rate": 4.9468147925698525e-06, + "loss": 0.959, + "step": 124 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 2.8612916469573975, + "learning_rate": 4.945949619357668e-06, + "loss": 0.7611, + "step": 125 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 2.9609551429748535, + "learning_rate": 4.945077542731188e-06, + "loss": 0.5753, + "step": 126 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 3.7842485904693604, + "learning_rate": 4.94419856515175e-06, + "loss": 0.8995, + "step": 127 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 3.513170003890991, + "learning_rate": 4.943312689100166e-06, + "loss": 0.9623, + "step": 128 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 2.690305471420288, + "learning_rate": 4.942419917076723e-06, + "loss": 0.6657, + "step": 129 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 2.951237440109253, + "learning_rate": 4.941520251601167e-06, + "loss": 0.7711, + "step": 130 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 2.8285868167877197, + "learning_rate": 4.940613695212702e-06, + "loss": 0.5908, + "step": 131 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.6700541973114014, + "learning_rate": 4.939700250469979e-06, + "loss": 0.967, + "step": 132 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 3.229152202606201, + "learning_rate": 4.938779919951092e-06, + "loss": 0.9519, + "step": 133 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 2.403944730758667, + "learning_rate": 4.93785270625357e-06, + "loss": 0.5873, + "step": 134 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 3.8491666316986084, + "learning_rate": 4.936918611994368e-06, + "loss": 0.8148, + "step": 135 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.8255743980407715, + "learning_rate": 4.935977639809861e-06, + "loss": 0.8286, + "step": 136 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 2.8479511737823486, + "learning_rate": 4.935029792355834e-06, + "loss": 0.6442, + "step": 137 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 2.585566759109497, + "learning_rate": 4.934075072307481e-06, + "loss": 1.0144, + "step": 138 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 2.8108413219451904, + "learning_rate": 4.933113482359388e-06, + "loss": 0.5922, + "step": 139 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 2.799546241760254, + "learning_rate": 4.932145025225535e-06, + "loss": 0.7546, + "step": 140 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 2.6492230892181396, + "learning_rate": 4.931169703639282e-06, + "loss": 0.8797, + "step": 141 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 4.130539417266846, + "learning_rate": 4.930187520353363e-06, + "loss": 0.865, + "step": 142 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 2.6537978649139404, + "learning_rate": 4.929198478139877e-06, + "loss": 0.6901, + "step": 143 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 2.488971710205078, + "learning_rate": 4.928202579790285e-06, + "loss": 0.5932, + "step": 144 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 2.4585540294647217, + "learning_rate": 4.927199828115395e-06, + "loss": 0.7742, + "step": 145 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 2.5525095462799072, + "learning_rate": 4.9261902259453616e-06, + "loss": 0.8475, + "step": 146 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 3.032649040222168, + "learning_rate": 4.925173776129669e-06, + "loss": 1.0514, + "step": 147 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 2.4535398483276367, + "learning_rate": 4.9241504815371346e-06, + "loss": 0.5964, + "step": 148 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 2.2060890197753906, + "learning_rate": 4.923120345055887e-06, + "loss": 0.7615, + "step": 149 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 3.0113794803619385, + "learning_rate": 4.922083369593372e-06, + "loss": 0.6908, + "step": 150 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 2.6805336475372314, + "learning_rate": 4.921039558076335e-06, + "loss": 0.8661, + "step": 151 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 3.562213897705078, + "learning_rate": 4.919988913450812e-06, + "loss": 0.5267, + "step": 152 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 3.3453261852264404, + "learning_rate": 4.918931438682132e-06, + "loss": 0.9222, + "step": 153 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 2.7286977767944336, + "learning_rate": 4.917867136754894e-06, + "loss": 0.8865, + "step": 154 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 2.263981819152832, + "learning_rate": 4.916796010672969e-06, + "loss": 0.7262, + "step": 155 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 2.273568630218506, + "learning_rate": 4.91571806345949e-06, + "loss": 0.7611, + "step": 156 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 3.0288827419281006, + "learning_rate": 4.91463329815684e-06, + "loss": 0.8745, + "step": 157 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 2.3675708770751953, + "learning_rate": 4.913541717826645e-06, + "loss": 0.6164, + "step": 158 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 2.2979559898376465, + "learning_rate": 4.912443325549767e-06, + "loss": 0.5549, + "step": 159 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 6.2421064376831055, + "learning_rate": 4.911338124426291e-06, + "loss": 0.9052, + "step": 160 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 2.125546932220459, + "learning_rate": 4.910226117575525e-06, + "loss": 0.7989, + "step": 161 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 2.8069941997528076, + "learning_rate": 4.909107308135978e-06, + "loss": 0.5915, + "step": 162 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 2.9329476356506348, + "learning_rate": 4.907981699265364e-06, + "loss": 0.6593, + "step": 163 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 3.8588013648986816, + "learning_rate": 4.906849294140587e-06, + "loss": 0.8739, + "step": 164 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 3.3252463340759277, + "learning_rate": 4.9057100959577285e-06, + "loss": 0.7314, + "step": 165 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 3.051591634750366, + "learning_rate": 4.904564107932048e-06, + "loss": 1.0109, + "step": 166 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 2.8550548553466797, + "learning_rate": 4.903411333297966e-06, + "loss": 0.9092, + "step": 167 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 2.8500938415527344, + "learning_rate": 4.902251775309057e-06, + "loss": 0.7922, + "step": 168 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 3.3096566200256348, + "learning_rate": 4.901085437238041e-06, + "loss": 0.5955, + "step": 169 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 2.7365124225616455, + "learning_rate": 4.899912322376776e-06, + "loss": 1.0019, + "step": 170 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 2.3542861938476562, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.8508, + "step": 171 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 2.822413921356201, + "learning_rate": 4.897545775546545e-06, + "loss": 0.8514, + "step": 172 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 2.528853416442871, + "learning_rate": 4.8963523502568886e-06, + "loss": 1.0263, + "step": 173 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 3.7086899280548096, + "learning_rate": 4.895152161535582e-06, + "loss": 0.7929, + "step": 174 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 2.407613515853882, + "learning_rate": 4.893945212770019e-06, + "loss": 0.7227, + "step": 175 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 2.629978656768799, + "learning_rate": 4.892731507366678e-06, + "loss": 0.8923, + "step": 176 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 2.281735897064209, + "learning_rate": 4.891511048751102e-06, + "loss": 0.7475, + "step": 177 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 2.8144044876098633, + "learning_rate": 4.890283840367898e-06, + "loss": 1.1405, + "step": 178 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 3.9945294857025146, + "learning_rate": 4.889049885680721e-06, + "loss": 0.8524, + "step": 179 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 2.9770278930664062, + "learning_rate": 4.887809188172268e-06, + "loss": 0.7617, + "step": 180 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 2.9451241493225098, + "learning_rate": 4.886561751344266e-06, + "loss": 0.8514, + "step": 181 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 2.670421600341797, + "learning_rate": 4.885307578717464e-06, + "loss": 0.8335, + "step": 182 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 2.565976858139038, + "learning_rate": 4.8840466738316216e-06, + "loss": 0.831, + "step": 183 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 2.5326290130615234, + "learning_rate": 4.882779040245499e-06, + "loss": 0.7891, + "step": 184 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 2.524470090866089, + "learning_rate": 4.881504681536847e-06, + "loss": 0.6257, + "step": 185 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 2.3305137157440186, + "learning_rate": 4.880223601302398e-06, + "loss": 0.6008, + "step": 186 + }, + { + "epoch": 1.0, + "grad_norm": 3.0916237831115723, + "learning_rate": 4.878935803157856e-06, + "loss": 0.6061, + "step": 187 + } + ], + "logging_steps": 1, + "max_steps": 1870, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.976503998434509e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja b/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6fe9f22d589fea1b113374cd4672ce50cc34304 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json @@ -0,0 +1,2652 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0053475935828877, + "grad_norm": 5.576223850250244, + "learning_rate": 5e-06, + "loss": 0.9394, + "step": 1 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 41.748443603515625, + "learning_rate": 4.99999647201733e-06, + "loss": 2.0122, + "step": 2 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 10.106061935424805, + "learning_rate": 4.999985888079276e-06, + "loss": 1.0092, + "step": 3 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 12.377921104431152, + "learning_rate": 4.999968248215712e-06, + "loss": 1.5196, + "step": 4 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 7.258418560028076, + "learning_rate": 4.999943552476422e-06, + "loss": 1.4586, + "step": 5 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 5.282329559326172, + "learning_rate": 4.999911800931108e-06, + "loss": 1.1068, + "step": 6 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 3.468794822692871, + "learning_rate": 4.999872993669387e-06, + "loss": 0.8997, + "step": 7 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 3.2200160026550293, + "learning_rate": 4.999827130800785e-06, + "loss": 1.075, + "step": 8 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 21.746450424194336, + "learning_rate": 4.999774212454746e-06, + "loss": 1.691, + "step": 9 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 11.011313438415527, + "learning_rate": 4.999714238780626e-06, + "loss": 1.3167, + "step": 10 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 5.002156734466553, + "learning_rate": 4.999647209947694e-06, + "loss": 0.9653, + "step": 11 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 12.305068016052246, + "learning_rate": 4.999573126145132e-06, + "loss": 1.2992, + "step": 12 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 5.660033702850342, + "learning_rate": 4.999491987582032e-06, + "loss": 0.9204, + "step": 13 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 5.366727828979492, + "learning_rate": 4.999403794487399e-06, + "loss": 1.3307, + "step": 14 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 3.265700578689575, + "learning_rate": 4.999308547110147e-06, + "loss": 0.8596, + "step": 15 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 3.0776281356811523, + "learning_rate": 4.9992062457191005e-06, + "loss": 0.9614, + "step": 16 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.8679943084716797, + "learning_rate": 4.999096890602996e-06, + "loss": 0.8, + "step": 17 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.785064935684204, + "learning_rate": 4.998980482070473e-06, + "loss": 0.7683, + "step": 18 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 4.619974613189697, + "learning_rate": 4.998857020450084e-06, + "loss": 1.2742, + "step": 19 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 2.894366979598999, + "learning_rate": 4.998726506090283e-06, + "loss": 0.8559, + "step": 20 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 3.4240734577178955, + "learning_rate": 4.998588939359435e-06, + "loss": 0.8223, + "step": 21 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 4.5151777267456055, + "learning_rate": 4.998444320645803e-06, + "loss": 1.1229, + "step": 22 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 2.7780518531799316, + "learning_rate": 4.998292650357558e-06, + "loss": 0.8936, + "step": 23 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 3.7252511978149414, + "learning_rate": 4.998133928922773e-06, + "loss": 1.2552, + "step": 24 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 4.296158313751221, + "learning_rate": 4.99796815678942e-06, + "loss": 0.7075, + "step": 25 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 5.5546956062316895, + "learning_rate": 4.997795334425372e-06, + "loss": 0.9781, + "step": 26 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 3.685818910598755, + "learning_rate": 4.997615462318403e-06, + "loss": 1.0657, + "step": 27 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 3.5500221252441406, + "learning_rate": 4.997428540976177e-06, + "loss": 0.951, + "step": 28 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 13.312395095825195, + "learning_rate": 4.997234570926263e-06, + "loss": 0.6788, + "step": 29 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.6344847679138184, + "learning_rate": 4.997033552716116e-06, + "loss": 0.8, + "step": 30 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 3.0757298469543457, + "learning_rate": 4.9968254869130885e-06, + "loss": 0.7625, + "step": 31 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 4.064891815185547, + "learning_rate": 4.996610374104422e-06, + "loss": 0.7381, + "step": 32 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 7.529796123504639, + "learning_rate": 4.9963882148972475e-06, + "loss": 1.3283, + "step": 33 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 3.2115354537963867, + "learning_rate": 4.996159009918586e-06, + "loss": 1.0002, + "step": 34 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 4.122320652008057, + "learning_rate": 4.9959227598153395e-06, + "loss": 0.9095, + "step": 35 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 54.98562240600586, + "learning_rate": 4.9956794652542994e-06, + "loss": 1.2191, + "step": 36 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 3.083123207092285, + "learning_rate": 4.9954291269221364e-06, + "loss": 0.7424, + "step": 37 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 15.99591064453125, + "learning_rate": 4.995171745525401e-06, + "loss": 0.9289, + "step": 38 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 5.214310169219971, + "learning_rate": 4.994907321790524e-06, + "loss": 0.991, + "step": 39 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 3.4376749992370605, + "learning_rate": 4.994635856463811e-06, + "loss": 0.6406, + "step": 40 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 4.30764102935791, + "learning_rate": 4.994357350311441e-06, + "loss": 1.2038, + "step": 41 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 3.5810039043426514, + "learning_rate": 4.994071804119467e-06, + "loss": 0.9696, + "step": 42 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 4.080881595611572, + "learning_rate": 4.993779218693811e-06, + "loss": 1.1579, + "step": 43 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 3.1389286518096924, + "learning_rate": 4.99347959486026e-06, + "loss": 0.7118, + "step": 44 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 2.6397321224212646, + "learning_rate": 4.99317293346447e-06, + "loss": 0.7579, + "step": 45 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 2.9469995498657227, + "learning_rate": 4.992859235371958e-06, + "loss": 0.7105, + "step": 46 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 2.669086456298828, + "learning_rate": 4.992538501468101e-06, + "loss": 0.6812, + "step": 47 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 5.421566009521484, + "learning_rate": 4.992210732658132e-06, + "loss": 0.9733, + "step": 48 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 4.413289546966553, + "learning_rate": 4.991875929867143e-06, + "loss": 1.1301, + "step": 49 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 3.1602351665496826, + "learning_rate": 4.991534094040077e-06, + "loss": 0.6706, + "step": 50 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 4.374372959136963, + "learning_rate": 4.991185226141726e-06, + "loss": 0.9462, + "step": 51 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.9649057388305664, + "learning_rate": 4.990829327156729e-06, + "loss": 1.0714, + "step": 52 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 3.1991283893585205, + "learning_rate": 4.990466398089571e-06, + "loss": 0.9175, + "step": 53 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 2.580082654953003, + "learning_rate": 4.99009643996458e-06, + "loss": 0.5164, + "step": 54 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 3.9115707874298096, + "learning_rate": 4.989719453825918e-06, + "loss": 0.7223, + "step": 55 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 2.825481414794922, + "learning_rate": 4.989335440737587e-06, + "loss": 0.7065, + "step": 56 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 2.8599696159362793, + "learning_rate": 4.9889444017834185e-06, + "loss": 0.8833, + "step": 57 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 2.885662078857422, + "learning_rate": 4.988546338067078e-06, + "loss": 0.8664, + "step": 58 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 3.187185764312744, + "learning_rate": 4.988141250712053e-06, + "loss": 0.884, + "step": 59 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 3.7545692920684814, + "learning_rate": 4.987729140861657e-06, + "loss": 0.9891, + "step": 60 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 3.0581002235412598, + "learning_rate": 4.987310009679023e-06, + "loss": 0.8838, + "step": 61 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 2.8039402961730957, + "learning_rate": 4.986883858347101e-06, + "loss": 0.8188, + "step": 62 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 3.01231050491333, + "learning_rate": 4.986450688068655e-06, + "loss": 0.6032, + "step": 63 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 2.7969677448272705, + "learning_rate": 4.986010500066258e-06, + "loss": 0.7623, + "step": 64 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 2.757786989212036, + "learning_rate": 4.985563295582292e-06, + "loss": 0.8051, + "step": 65 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.9582571983337402, + "learning_rate": 4.98510907587894e-06, + "loss": 0.7901, + "step": 66 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 3.104294776916504, + "learning_rate": 4.984647842238185e-06, + "loss": 1.0582, + "step": 67 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 2.7413785457611084, + "learning_rate": 4.984179595961806e-06, + "loss": 0.5912, + "step": 68 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 2.722858190536499, + "learning_rate": 4.983704338371375e-06, + "loss": 0.7855, + "step": 69 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 2.5095815658569336, + "learning_rate": 4.983222070808255e-06, + "loss": 0.6491, + "step": 70 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 2.97511887550354, + "learning_rate": 4.982732794633588e-06, + "loss": 0.9735, + "step": 71 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 3.5139546394348145, + "learning_rate": 4.982236511228301e-06, + "loss": 0.8495, + "step": 72 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 3.086568593978882, + "learning_rate": 4.981733221993099e-06, + "loss": 1.0891, + "step": 73 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 3.490666389465332, + "learning_rate": 4.981222928348456e-06, + "loss": 0.8013, + "step": 74 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 3.3275415897369385, + "learning_rate": 4.98070563173462e-06, + "loss": 0.8298, + "step": 75 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 2.7193403244018555, + "learning_rate": 4.980181333611601e-06, + "loss": 0.6989, + "step": 76 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 2.5338242053985596, + "learning_rate": 4.979650035459171e-06, + "loss": 0.6769, + "step": 77 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 2.7369015216827393, + "learning_rate": 4.9791117387768575e-06, + "loss": 1.0385, + "step": 78 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 2.6109988689422607, + "learning_rate": 4.978566445083942e-06, + "loss": 0.6498, + "step": 79 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 3.0895354747772217, + "learning_rate": 4.978014155919455e-06, + "loss": 0.7931, + "step": 80 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 2.6197807788848877, + "learning_rate": 4.977454872842169e-06, + "loss": 0.7322, + "step": 81 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 2.9248461723327637, + "learning_rate": 4.976888597430597e-06, + "loss": 0.9184, + "step": 82 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 2.7636630535125732, + "learning_rate": 4.976315331282985e-06, + "loss": 0.8258, + "step": 83 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 2.702061653137207, + "learning_rate": 4.9757350760173144e-06, + "loss": 0.7414, + "step": 84 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 2.3016257286071777, + "learning_rate": 4.975147833271288e-06, + "loss": 0.8573, + "step": 85 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 2.758795738220215, + "learning_rate": 4.974553604702332e-06, + "loss": 0.7271, + "step": 86 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 3.0134952068328857, + "learning_rate": 4.973952391987589e-06, + "loss": 0.8976, + "step": 87 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.668630838394165, + "learning_rate": 4.9733441968239125e-06, + "loss": 1.0753, + "step": 88 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 2.5940303802490234, + "learning_rate": 4.972729020927866e-06, + "loss": 0.6903, + "step": 89 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 3.0423827171325684, + "learning_rate": 4.97210686603571e-06, + "loss": 0.9347, + "step": 90 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 2.5026450157165527, + "learning_rate": 4.97147773390341e-06, + "loss": 0.6738, + "step": 91 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 3.596545457839966, + "learning_rate": 4.970841626306617e-06, + "loss": 0.8356, + "step": 92 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 3.2207071781158447, + "learning_rate": 4.970198545040673e-06, + "loss": 0.9117, + "step": 93 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 2.858541965484619, + "learning_rate": 4.969548491920603e-06, + "loss": 0.8237, + "step": 94 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 2.896359920501709, + "learning_rate": 4.968891468781105e-06, + "loss": 0.8775, + "step": 95 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 3.6659083366394043, + "learning_rate": 4.968227477476554e-06, + "loss": 0.9068, + "step": 96 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 3.6469972133636475, + "learning_rate": 4.9675565198809905e-06, + "loss": 1.0435, + "step": 97 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 4.615362167358398, + "learning_rate": 4.966878597888114e-06, + "loss": 1.0084, + "step": 98 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 3.4075334072113037, + "learning_rate": 4.966193713411284e-06, + "loss": 0.7217, + "step": 99 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.965501868383507e-06, + "loss": 0.6594, + "step": 100 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 4.086977958679199, + "learning_rate": 4.964803064757438e-06, + "loss": 0.9249, + "step": 101 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.676903247833252, + "learning_rate": 4.964097304505371e-06, + "loss": 0.7776, + "step": 102 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 2.5098068714141846, + "learning_rate": 4.963384589619233e-06, + "loss": 0.6339, + "step": 103 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 4.064920902252197, + "learning_rate": 4.962664922110581e-06, + "loss": 1.0107, + "step": 104 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 2.6229960918426514, + "learning_rate": 4.9619383040105954e-06, + "loss": 1.0052, + "step": 105 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 2.857506275177002, + "learning_rate": 4.961204737370071e-06, + "loss": 0.8577, + "step": 106 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 3.9176764488220215, + "learning_rate": 4.960464224259418e-06, + "loss": 1.1237, + "step": 107 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 2.9063003063201904, + "learning_rate": 4.95971676676865e-06, + "loss": 0.6237, + "step": 108 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 3.1583969593048096, + "learning_rate": 4.958962367007381e-06, + "loss": 0.9135, + "step": 109 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.7559218406677246, + "learning_rate": 4.958201027104818e-06, + "loss": 0.7461, + "step": 110 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 11.086910247802734, + "learning_rate": 4.957432749209755e-06, + "loss": 0.69, + "step": 111 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 3.8109939098358154, + "learning_rate": 4.95665753549057e-06, + "loss": 0.8578, + "step": 112 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 3.3317348957061768, + "learning_rate": 4.9558753881352165e-06, + "loss": 1.3098, + "step": 113 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 2.715823173522949, + "learning_rate": 4.955086309351213e-06, + "loss": 0.9979, + "step": 114 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 2.798602819442749, + "learning_rate": 4.9542903013656485e-06, + "loss": 0.6298, + "step": 115 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 32.90562438964844, + "learning_rate": 4.953487366425163e-06, + "loss": 0.959, + "step": 116 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 4.012441158294678, + "learning_rate": 4.952677506795949e-06, + "loss": 0.6791, + "step": 117 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 3.548151731491089, + "learning_rate": 4.951860724763743e-06, + "loss": 0.7783, + "step": 118 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 3.4778249263763428, + "learning_rate": 4.95103702263382e-06, + "loss": 0.8085, + "step": 119 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 2.625532627105713, + "learning_rate": 4.950206402730984e-06, + "loss": 0.7702, + "step": 120 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 3.2743935585021973, + "learning_rate": 4.949368867399567e-06, + "loss": 0.602, + "step": 121 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 3.9576094150543213, + "learning_rate": 4.948524419003415e-06, + "loss": 1.2858, + "step": 122 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 3.233257532119751, + "learning_rate": 4.947673059925889e-06, + "loss": 0.7945, + "step": 123 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 2.6730406284332275, + "learning_rate": 4.9468147925698525e-06, + "loss": 0.959, + "step": 124 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 2.8612916469573975, + "learning_rate": 4.945949619357668e-06, + "loss": 0.7611, + "step": 125 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 2.9609551429748535, + "learning_rate": 4.945077542731188e-06, + "loss": 0.5753, + "step": 126 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 3.7842485904693604, + "learning_rate": 4.94419856515175e-06, + "loss": 0.8995, + "step": 127 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 3.513170003890991, + "learning_rate": 4.943312689100166e-06, + "loss": 0.9623, + "step": 128 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 2.690305471420288, + "learning_rate": 4.942419917076723e-06, + "loss": 0.6657, + "step": 129 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 2.951237440109253, + "learning_rate": 4.941520251601167e-06, + "loss": 0.7711, + "step": 130 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 2.8285868167877197, + "learning_rate": 4.940613695212702e-06, + "loss": 0.5908, + "step": 131 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.6700541973114014, + "learning_rate": 4.939700250469979e-06, + "loss": 0.967, + "step": 132 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 3.229152202606201, + "learning_rate": 4.938779919951092e-06, + "loss": 0.9519, + "step": 133 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 2.403944730758667, + "learning_rate": 4.93785270625357e-06, + "loss": 0.5873, + "step": 134 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 3.8491666316986084, + "learning_rate": 4.936918611994368e-06, + "loss": 0.8148, + "step": 135 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.8255743980407715, + "learning_rate": 4.935977639809861e-06, + "loss": 0.8286, + "step": 136 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 2.8479511737823486, + "learning_rate": 4.935029792355834e-06, + "loss": 0.6442, + "step": 137 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 2.585566759109497, + "learning_rate": 4.934075072307481e-06, + "loss": 1.0144, + "step": 138 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 2.8108413219451904, + "learning_rate": 4.933113482359388e-06, + "loss": 0.5922, + "step": 139 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 2.799546241760254, + "learning_rate": 4.932145025225535e-06, + "loss": 0.7546, + "step": 140 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 2.6492230892181396, + "learning_rate": 4.931169703639282e-06, + "loss": 0.8797, + "step": 141 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 4.130539417266846, + "learning_rate": 4.930187520353363e-06, + "loss": 0.865, + "step": 142 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 2.6537978649139404, + "learning_rate": 4.929198478139877e-06, + "loss": 0.6901, + "step": 143 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 2.488971710205078, + "learning_rate": 4.928202579790285e-06, + "loss": 0.5932, + "step": 144 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 2.4585540294647217, + "learning_rate": 4.927199828115395e-06, + "loss": 0.7742, + "step": 145 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 2.5525095462799072, + "learning_rate": 4.9261902259453616e-06, + "loss": 0.8475, + "step": 146 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 3.032649040222168, + "learning_rate": 4.925173776129669e-06, + "loss": 1.0514, + "step": 147 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 2.4535398483276367, + "learning_rate": 4.9241504815371346e-06, + "loss": 0.5964, + "step": 148 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 2.2060890197753906, + "learning_rate": 4.923120345055887e-06, + "loss": 0.7615, + "step": 149 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 3.0113794803619385, + "learning_rate": 4.922083369593372e-06, + "loss": 0.6908, + "step": 150 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 2.6805336475372314, + "learning_rate": 4.921039558076335e-06, + "loss": 0.8661, + "step": 151 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 3.562213897705078, + "learning_rate": 4.919988913450812e-06, + "loss": 0.5267, + "step": 152 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 3.3453261852264404, + "learning_rate": 4.918931438682132e-06, + "loss": 0.9222, + "step": 153 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 2.7286977767944336, + "learning_rate": 4.917867136754894e-06, + "loss": 0.8865, + "step": 154 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 2.263981819152832, + "learning_rate": 4.916796010672969e-06, + "loss": 0.7262, + "step": 155 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 2.273568630218506, + "learning_rate": 4.91571806345949e-06, + "loss": 0.7611, + "step": 156 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 3.0288827419281006, + "learning_rate": 4.91463329815684e-06, + "loss": 0.8745, + "step": 157 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 2.3675708770751953, + "learning_rate": 4.913541717826645e-06, + "loss": 0.6164, + "step": 158 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 2.2979559898376465, + "learning_rate": 4.912443325549767e-06, + "loss": 0.5549, + "step": 159 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 6.2421064376831055, + "learning_rate": 4.911338124426291e-06, + "loss": 0.9052, + "step": 160 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 2.125546932220459, + "learning_rate": 4.910226117575525e-06, + "loss": 0.7989, + "step": 161 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 2.8069941997528076, + "learning_rate": 4.909107308135978e-06, + "loss": 0.5915, + "step": 162 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 2.9329476356506348, + "learning_rate": 4.907981699265364e-06, + "loss": 0.6593, + "step": 163 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 3.8588013648986816, + "learning_rate": 4.906849294140587e-06, + "loss": 0.8739, + "step": 164 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 3.3252463340759277, + "learning_rate": 4.9057100959577285e-06, + "loss": 0.7314, + "step": 165 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 3.051591634750366, + "learning_rate": 4.904564107932048e-06, + "loss": 1.0109, + "step": 166 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 2.8550548553466797, + "learning_rate": 4.903411333297966e-06, + "loss": 0.9092, + "step": 167 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 2.8500938415527344, + "learning_rate": 4.902251775309057e-06, + "loss": 0.7922, + "step": 168 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 3.3096566200256348, + "learning_rate": 4.901085437238041e-06, + "loss": 0.5955, + "step": 169 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 2.7365124225616455, + "learning_rate": 4.899912322376776e-06, + "loss": 1.0019, + "step": 170 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 2.3542861938476562, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.8508, + "step": 171 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 2.822413921356201, + "learning_rate": 4.897545775546545e-06, + "loss": 0.8514, + "step": 172 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 2.528853416442871, + "learning_rate": 4.8963523502568886e-06, + "loss": 1.0263, + "step": 173 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 3.7086899280548096, + "learning_rate": 4.895152161535582e-06, + "loss": 0.7929, + "step": 174 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 2.407613515853882, + "learning_rate": 4.893945212770019e-06, + "loss": 0.7227, + "step": 175 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 2.629978656768799, + "learning_rate": 4.892731507366678e-06, + "loss": 0.8923, + "step": 176 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 2.281735897064209, + "learning_rate": 4.891511048751102e-06, + "loss": 0.7475, + "step": 177 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 2.8144044876098633, + "learning_rate": 4.890283840367898e-06, + "loss": 1.1405, + "step": 178 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 3.9945294857025146, + "learning_rate": 4.889049885680721e-06, + "loss": 0.8524, + "step": 179 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 2.9770278930664062, + "learning_rate": 4.887809188172268e-06, + "loss": 0.7617, + "step": 180 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 2.9451241493225098, + "learning_rate": 4.886561751344266e-06, + "loss": 0.8514, + "step": 181 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 2.670421600341797, + "learning_rate": 4.885307578717464e-06, + "loss": 0.8335, + "step": 182 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 2.565976858139038, + "learning_rate": 4.8840466738316216e-06, + "loss": 0.831, + "step": 183 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 2.5326290130615234, + "learning_rate": 4.882779040245499e-06, + "loss": 0.7891, + "step": 184 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 2.524470090866089, + "learning_rate": 4.881504681536847e-06, + "loss": 0.6257, + "step": 185 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 2.3305137157440186, + "learning_rate": 4.880223601302398e-06, + "loss": 0.6008, + "step": 186 + }, + { + "epoch": 1.0, + "grad_norm": 3.0916237831115723, + "learning_rate": 4.878935803157856e-06, + "loss": 0.6061, + "step": 187 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 3.003761053085327, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6628, + "step": 188 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 2.674351692199707, + "learning_rate": 4.876340067696097e-06, + "loss": 0.6124, + "step": 189 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 3.9263675212860107, + "learning_rate": 4.875032137705047e-06, + "loss": 0.7186, + "step": 190 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 3.006312370300293, + "learning_rate": 4.873717504456219e-06, + "loss": 0.7723, + "step": 191 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 2.5927529335021973, + "learning_rate": 4.872396171660014e-06, + "loss": 0.4069, + "step": 192 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 3.193277597427368, + "learning_rate": 4.8710681430457466e-06, + "loss": 0.6705, + "step": 193 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 4.224829196929932, + "learning_rate": 4.8697334223616226e-06, + "loss": 0.8276, + "step": 194 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 3.008603096008301, + "learning_rate": 4.8683920133747405e-06, + "loss": 0.5913, + "step": 195 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 2.7365758419036865, + "learning_rate": 4.867043919871076e-06, + "loss": 0.5244, + "step": 196 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 3.109424352645874, + "learning_rate": 4.865689145655467e-06, + "loss": 0.5962, + "step": 197 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 2.6860733032226562, + "learning_rate": 4.864327694551612e-06, + "loss": 0.5601, + "step": 198 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 3.3604085445404053, + "learning_rate": 4.86295957040205e-06, + "loss": 0.953, + "step": 199 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 3.981157064437866, + "learning_rate": 4.861584777068154e-06, + "loss": 0.7394, + "step": 200 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 3.687598943710327, + "learning_rate": 4.860203318430126e-06, + "loss": 0.3851, + "step": 201 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 2.9157185554504395, + "learning_rate": 4.858815198386973e-06, + "loss": 0.6595, + "step": 202 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 2.805755853652954, + "learning_rate": 4.8574204208565056e-06, + "loss": 0.5308, + "step": 203 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 2.6051762104034424, + "learning_rate": 4.856018989775326e-06, + "loss": 0.5401, + "step": 204 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 2.8916220664978027, + "learning_rate": 4.854610909098813e-06, + "loss": 0.6365, + "step": 205 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 3.389765977859497, + "learning_rate": 4.853196182801112e-06, + "loss": 1.0949, + "step": 206 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 2.910980701446533, + "learning_rate": 4.851774814875131e-06, + "loss": 0.6629, + "step": 207 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 2.8479011058807373, + "learning_rate": 4.850346809332515e-06, + "loss": 0.7166, + "step": 208 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 2.242565155029297, + "learning_rate": 4.8489121702036515e-06, + "loss": 0.7077, + "step": 209 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 2.833369731903076, + "learning_rate": 4.847470901537642e-06, + "loss": 0.6319, + "step": 210 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 3.053952217102051, + "learning_rate": 4.846023007402305e-06, + "loss": 0.5327, + "step": 211 + }, + { + "epoch": 1.1336898395721926, + "grad_norm": 3.0862181186676025, + "learning_rate": 4.844568491884156e-06, + "loss": 0.414, + "step": 212 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 2.6374268531799316, + "learning_rate": 4.843107359088402e-06, + "loss": 0.5933, + "step": 213 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 8.499526023864746, + "learning_rate": 4.84163961313892e-06, + "loss": 0.6844, + "step": 214 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 2.2556655406951904, + "learning_rate": 4.840165258178259e-06, + "loss": 0.5242, + "step": 215 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 2.8057925701141357, + "learning_rate": 4.838684298367616e-06, + "loss": 0.747, + "step": 216 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 2.6920077800750732, + "learning_rate": 4.837196737886834e-06, + "loss": 0.7602, + "step": 217 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 3.1757941246032715, + "learning_rate": 4.83570258093438e-06, + "loss": 0.7525, + "step": 218 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 2.869535446166992, + "learning_rate": 4.834201831727343e-06, + "loss": 0.5111, + "step": 219 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 2.853529930114746, + "learning_rate": 4.832694494501417e-06, + "loss": 0.6215, + "step": 220 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 2.854609727859497, + "learning_rate": 4.83118057351089e-06, + "loss": 0.3931, + "step": 221 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 3.3581626415252686, + "learning_rate": 4.829660073028631e-06, + "loss": 0.6418, + "step": 222 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 3.3372135162353516, + "learning_rate": 4.82813299734608e-06, + "loss": 0.5028, + "step": 223 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 3.1315996646881104, + "learning_rate": 4.826599350773234e-06, + "loss": 0.4452, + "step": 224 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 2.9624111652374268, + "learning_rate": 4.825059137638636e-06, + "loss": 0.7803, + "step": 225 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 3.0918056964874268, + "learning_rate": 4.823512362289362e-06, + "loss": 0.5968, + "step": 226 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 2.905611276626587, + "learning_rate": 4.821959029091009e-06, + "loss": 0.5724, + "step": 227 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 2.967761278152466, + "learning_rate": 4.820399142427684e-06, + "loss": 0.5357, + "step": 228 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 3.3968875408172607, + "learning_rate": 4.818832706701989e-06, + "loss": 0.5743, + "step": 229 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 3.2088563442230225, + "learning_rate": 4.817259726335009e-06, + "loss": 0.8447, + "step": 230 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 2.8846428394317627, + "learning_rate": 4.815680205766304e-06, + "loss": 0.8136, + "step": 231 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 2.198012351989746, + "learning_rate": 4.814094149453891e-06, + "loss": 0.4073, + "step": 232 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 3.148988962173462, + "learning_rate": 4.812501561874232e-06, + "loss": 0.6625, + "step": 233 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 2.770563840866089, + "learning_rate": 4.8109024475222255e-06, + "loss": 0.6403, + "step": 234 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 3.157482147216797, + "learning_rate": 4.809296810911188e-06, + "loss": 0.8436, + "step": 235 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 3.0236425399780273, + "learning_rate": 4.8076846565728475e-06, + "loss": 0.8578, + "step": 236 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 2.508145570755005, + "learning_rate": 4.806065989057326e-06, + "loss": 0.5431, + "step": 237 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 3.171482563018799, + "learning_rate": 4.8044408129331266e-06, + "loss": 0.4613, + "step": 238 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 3.209517240524292, + "learning_rate": 4.802809132787125e-06, + "loss": 0.6743, + "step": 239 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 2.8249428272247314, + "learning_rate": 4.801170953224554e-06, + "loss": 0.8116, + "step": 240 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 2.2719192504882812, + "learning_rate": 4.7995262788689865e-06, + "loss": 0.4008, + "step": 241 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 3.2883615493774414, + "learning_rate": 4.797875114362331e-06, + "loss": 0.5953, + "step": 242 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 17.095844268798828, + "learning_rate": 4.796217464364808e-06, + "loss": 0.8779, + "step": 243 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 3.6116573810577393, + "learning_rate": 4.794553333554949e-06, + "loss": 0.7568, + "step": 244 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 2.622695207595825, + "learning_rate": 4.792882726629572e-06, + "loss": 0.5016, + "step": 245 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 8.820343017578125, + "learning_rate": 4.791205648303775e-06, + "loss": 0.8415, + "step": 246 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 2.8980658054351807, + "learning_rate": 4.789522103310922e-06, + "loss": 0.6032, + "step": 247 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 2.6704914569854736, + "learning_rate": 4.787832096402626e-06, + "loss": 0.6548, + "step": 248 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 3.3483593463897705, + "learning_rate": 4.786135632348738e-06, + "loss": 0.6212, + "step": 249 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 2.6832988262176514, + "learning_rate": 4.7844327159373365e-06, + "loss": 0.8052, + "step": 250 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 2.599897623062134, + "learning_rate": 4.782723351974708e-06, + "loss": 0.589, + "step": 251 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 3.2921037673950195, + "learning_rate": 4.7810075452853385e-06, + "loss": 0.63, + "step": 252 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 2.5389389991760254, + "learning_rate": 4.779285300711897e-06, + "loss": 0.6727, + "step": 253 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 2.817018985748291, + "learning_rate": 4.7775566231152216e-06, + "loss": 0.4158, + "step": 254 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 2.749091863632202, + "learning_rate": 4.775821517374308e-06, + "loss": 0.8809, + "step": 255 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 2.599484443664551, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.5157, + "step": 256 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 2.412386417388916, + "learning_rate": 4.772332041066452e-06, + "loss": 0.4467, + "step": 257 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 2.713000774383545, + "learning_rate": 4.770577680348159e-06, + "loss": 0.9125, + "step": 258 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 3.22122859954834, + "learning_rate": 4.768816911182899e-06, + "loss": 0.4665, + "step": 259 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 2.9274754524230957, + "learning_rate": 4.767049738540244e-06, + "loss": 0.5404, + "step": 260 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 2.2020022869110107, + "learning_rate": 4.765276167407836e-06, + "loss": 0.4575, + "step": 261 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 3.0807480812072754, + "learning_rate": 4.7634962027913784e-06, + "loss": 0.8227, + "step": 262 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 2.655407667160034, + "learning_rate": 4.761709849714619e-06, + "loss": 0.5813, + "step": 263 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 2.580695152282715, + "learning_rate": 4.7599171132193355e-06, + "loss": 0.6333, + "step": 264 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 2.8121836185455322, + "learning_rate": 4.7581179983653224e-06, + "loss": 0.6368, + "step": 265 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 3.2582831382751465, + "learning_rate": 4.756312510230377e-06, + "loss": 0.4146, + "step": 266 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 3.0589146614074707, + "learning_rate": 4.754500653910284e-06, + "loss": 0.6066, + "step": 267 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 3.0196666717529297, + "learning_rate": 4.752682434518801e-06, + "loss": 0.6254, + "step": 268 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 2.9189376831054688, + "learning_rate": 4.750857857187645e-06, + "loss": 0.4853, + "step": 269 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 2.299985885620117, + "learning_rate": 4.749026927066479e-06, + "loss": 0.7066, + "step": 270 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 2.0745482444763184, + "learning_rate": 4.747189649322894e-06, + "loss": 0.5224, + "step": 271 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 3.8428823947906494, + "learning_rate": 4.745346029142397e-06, + "loss": 0.7391, + "step": 272 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 2.409541368484497, + "learning_rate": 4.743496071728396e-06, + "loss": 0.6529, + "step": 273 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 2.810421943664551, + "learning_rate": 4.741639782302187e-06, + "loss": 0.453, + "step": 274 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 2.9112162590026855, + "learning_rate": 4.739777166102933e-06, + "loss": 0.5275, + "step": 275 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 2.653869390487671, + "learning_rate": 4.737908228387656e-06, + "loss": 0.5838, + "step": 276 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 2.7957050800323486, + "learning_rate": 4.736032974431222e-06, + "loss": 0.5719, + "step": 277 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 2.4398281574249268, + "learning_rate": 4.7341514095263214e-06, + "loss": 0.4318, + "step": 278 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 3.5739479064941406, + "learning_rate": 4.732263538983456e-06, + "loss": 0.6388, + "step": 279 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 3.433971405029297, + "learning_rate": 4.730369368130925e-06, + "loss": 0.6673, + "step": 280 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 3.205761432647705, + "learning_rate": 4.728468902314811e-06, + "loss": 1.2311, + "step": 281 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 2.8073904514312744, + "learning_rate": 4.726562146898963e-06, + "loss": 0.6467, + "step": 282 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 3.282175064086914, + "learning_rate": 4.72464910726498e-06, + "loss": 0.6265, + "step": 283 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 3.5575335025787354, + "learning_rate": 4.7227297888121985e-06, + "loss": 0.8415, + "step": 284 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 2.851593255996704, + "learning_rate": 4.720804196957676e-06, + "loss": 0.6441, + "step": 285 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 2.8091742992401123, + "learning_rate": 4.718872337136176e-06, + "loss": 0.8297, + "step": 286 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 2.456247091293335, + "learning_rate": 4.716934214800155e-06, + "loss": 0.9988, + "step": 287 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 2.6044399738311768, + "learning_rate": 4.714989835419741e-06, + "loss": 0.5931, + "step": 288 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 3.5424976348876953, + "learning_rate": 4.713039204482723e-06, + "loss": 0.5902, + "step": 289 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 3.1387109756469727, + "learning_rate": 4.711082327494536e-06, + "loss": 0.7356, + "step": 290 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 3.1310863494873047, + "learning_rate": 4.709119209978242e-06, + "loss": 0.529, + "step": 291 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 2.778148651123047, + "learning_rate": 4.707149857474516e-06, + "loss": 0.4536, + "step": 292 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 2.308875560760498, + "learning_rate": 4.705174275541632e-06, + "loss": 0.5565, + "step": 293 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 2.531953811645508, + "learning_rate": 4.703192469755444e-06, + "loss": 0.728, + "step": 294 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 2.6498258113861084, + "learning_rate": 4.701204445709375e-06, + "loss": 0.6269, + "step": 295 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 2.500495195388794, + "learning_rate": 4.699210209014394e-06, + "loss": 0.658, + "step": 296 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 2.733893394470215, + "learning_rate": 4.69720976529901e-06, + "loss": 0.5184, + "step": 297 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 2.8712120056152344, + "learning_rate": 4.695203120209245e-06, + "loss": 0.5321, + "step": 298 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 2.467778205871582, + "learning_rate": 4.693190279408628e-06, + "loss": 0.4647, + "step": 299 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 2.4705379009246826, + "learning_rate": 4.691171248578172e-06, + "loss": 0.4889, + "step": 300 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 2.4136300086975098, + "learning_rate": 4.689146033416362e-06, + "loss": 0.6621, + "step": 301 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 2.042703151702881, + "learning_rate": 4.687114639639136e-06, + "loss": 0.4009, + "step": 302 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 3.224032402038574, + "learning_rate": 4.685077072979874e-06, + "loss": 0.5065, + "step": 303 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 3.0109472274780273, + "learning_rate": 4.683033339189375e-06, + "loss": 0.5289, + "step": 304 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 2.7306134700775146, + "learning_rate": 4.680983444035843e-06, + "loss": 0.7078, + "step": 305 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 3.4351847171783447, + "learning_rate": 4.678927393304877e-06, + "loss": 0.4003, + "step": 306 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 2.6287615299224854, + "learning_rate": 4.676865192799443e-06, + "loss": 0.4802, + "step": 307 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 2.7532455921173096, + "learning_rate": 4.6747968483398695e-06, + "loss": 0.8128, + "step": 308 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 2.49472975730896, + "learning_rate": 4.672722365763821e-06, + "loss": 0.4085, + "step": 309 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 2.805548667907715, + "learning_rate": 4.6706417509262905e-06, + "loss": 0.5707, + "step": 310 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 3.333185911178589, + "learning_rate": 4.668555009699575e-06, + "loss": 0.481, + "step": 311 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 2.704253673553467, + "learning_rate": 4.666462147973264e-06, + "loss": 0.6021, + "step": 312 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 3.070093870162964, + "learning_rate": 4.664363171654223e-06, + "loss": 0.7208, + "step": 313 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 3.5783073902130127, + "learning_rate": 4.662258086666571e-06, + "loss": 0.9136, + "step": 314 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 2.5549259185791016, + "learning_rate": 4.660146898951674e-06, + "loss": 0.7375, + "step": 315 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 3.192612886428833, + "learning_rate": 4.6580296144681155e-06, + "loss": 0.6786, + "step": 316 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 4.031966209411621, + "learning_rate": 4.655906239191693e-06, + "loss": 0.789, + "step": 317 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 2.8713667392730713, + "learning_rate": 4.653776779115389e-06, + "loss": 0.7104, + "step": 318 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 7.210184097290039, + "learning_rate": 4.651641240249364e-06, + "loss": 0.5165, + "step": 319 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 2.636258602142334, + "learning_rate": 4.649499628620931e-06, + "loss": 0.4081, + "step": 320 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 2.4294848442077637, + "learning_rate": 4.647351950274548e-06, + "loss": 0.6536, + "step": 321 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 2.551454544067383, + "learning_rate": 4.6451982112717896e-06, + "loss": 0.6597, + "step": 322 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 8.412546157836914, + "learning_rate": 4.643038417691341e-06, + "loss": 0.7608, + "step": 323 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 2.47556734085083, + "learning_rate": 4.640872575628973e-06, + "loss": 0.4597, + "step": 324 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 3.3347442150115967, + "learning_rate": 4.6387006911975275e-06, + "loss": 0.7241, + "step": 325 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 3.182422637939453, + "learning_rate": 4.6365227705269026e-06, + "loss": 0.7654, + "step": 326 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 2.947328805923462, + "learning_rate": 4.634338819764029e-06, + "loss": 0.6391, + "step": 327 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 3.1109538078308105, + "learning_rate": 4.632148845072861e-06, + "loss": 0.5501, + "step": 328 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 3.0903382301330566, + "learning_rate": 4.6299528526343525e-06, + "loss": 0.6117, + "step": 329 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 3.745351552963257, + "learning_rate": 4.627750848646443e-06, + "loss": 0.8534, + "step": 330 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 3.4808154106140137, + "learning_rate": 4.625542839324036e-06, + "loss": 0.6352, + "step": 331 + }, + { + "epoch": 1.7754010695187166, + "grad_norm": 2.984961748123169, + "learning_rate": 4.6233288308989874e-06, + "loss": 0.4188, + "step": 332 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 2.6888809204101562, + "learning_rate": 4.6211088296200834e-06, + "loss": 0.4464, + "step": 333 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 2.868077039718628, + "learning_rate": 4.618882841753026e-06, + "loss": 0.6833, + "step": 334 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 2.7746901512145996, + "learning_rate": 4.616650873580411e-06, + "loss": 0.6356, + "step": 335 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 3.0901777744293213, + "learning_rate": 4.614412931401715e-06, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 3.2670090198516846, + "learning_rate": 4.612169021533276e-06, + "loss": 0.5275, + "step": 337 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 2.9879071712493896, + "learning_rate": 4.609919150308273e-06, + "loss": 0.6292, + "step": 338 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 2.9089176654815674, + "learning_rate": 4.607663324076711e-06, + "loss": 0.5315, + "step": 339 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 2.698115587234497, + "learning_rate": 4.605401549205404e-06, + "loss": 0.7492, + "step": 340 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 3.423445463180542, + "learning_rate": 4.603133832077953e-06, + "loss": 0.6453, + "step": 341 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 2.504528045654297, + "learning_rate": 4.600860179094732e-06, + "loss": 0.6502, + "step": 342 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 3.743797540664673, + "learning_rate": 4.5985805966728675e-06, + "loss": 0.6807, + "step": 343 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 2.732316732406616, + "learning_rate": 4.596295091246221e-06, + "loss": 0.5235, + "step": 344 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 2.681244134902954, + "learning_rate": 4.594003669265371e-06, + "loss": 0.5847, + "step": 345 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 2.7608835697174072, + "learning_rate": 4.591706337197597e-06, + "loss": 0.6266, + "step": 346 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 3.0770840644836426, + "learning_rate": 4.589403101526854e-06, + "loss": 0.5021, + "step": 347 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 2.7511236667633057, + "learning_rate": 4.587093968753765e-06, + "loss": 0.6426, + "step": 348 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 2.199262857437134, + "learning_rate": 4.584778945395594e-06, + "loss": 0.41, + "step": 349 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 4.125847816467285, + "learning_rate": 4.582458037986231e-06, + "loss": 0.6775, + "step": 350 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 3.585446834564209, + "learning_rate": 4.580131253076171e-06, + "loss": 0.9407, + "step": 351 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 3.3022289276123047, + "learning_rate": 4.5777985972325016e-06, + "loss": 0.6412, + "step": 352 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 2.9012153148651123, + "learning_rate": 4.575460077038877e-06, + "loss": 0.4353, + "step": 353 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 3.134577989578247, + "learning_rate": 4.573115699095505e-06, + "loss": 0.934, + "step": 354 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 2.8544585704803467, + "learning_rate": 4.570765470019125e-06, + "loss": 0.472, + "step": 355 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 3.232541084289551, + "learning_rate": 4.5684093964429906e-06, + "loss": 0.6079, + "step": 356 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 2.760040044784546, + "learning_rate": 4.566047485016853e-06, + "loss": 0.4644, + "step": 357 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 3.5607728958129883, + "learning_rate": 4.563679742406935e-06, + "loss": 0.721, + "step": 358 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 2.364783763885498, + "learning_rate": 4.5613061752959236e-06, + "loss": 1.0296, + "step": 359 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 3.172856092453003, + "learning_rate": 4.558926790382941e-06, + "loss": 0.892, + "step": 360 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 3.1738357543945312, + "learning_rate": 4.556541594383528e-06, + "loss": 0.6153, + "step": 361 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 2.396540880203247, + "learning_rate": 4.554150594029631e-06, + "loss": 0.3246, + "step": 362 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 2.347179651260376, + "learning_rate": 4.551753796069577e-06, + "loss": 0.5986, + "step": 363 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 2.559436082839966, + "learning_rate": 4.5493512072680535e-06, + "loss": 0.5642, + "step": 364 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 2.5733461380004883, + "learning_rate": 4.546942834406094e-06, + "loss": 0.7661, + "step": 365 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 2.9867851734161377, + "learning_rate": 4.544528684281056e-06, + "loss": 0.4739, + "step": 366 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 2.3558244705200195, + "learning_rate": 4.5421087637066065e-06, + "loss": 0.4551, + "step": 367 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 2.438739061355591, + "learning_rate": 4.539683079512692e-06, + "loss": 0.7336, + "step": 368 + }, + { + "epoch": 1.9732620320855614, + "grad_norm": 2.9113192558288574, + "learning_rate": 4.537251638545532e-06, + "loss": 0.5833, + "step": 369 + }, + { + "epoch": 1.9786096256684491, + "grad_norm": 2.915750741958618, + "learning_rate": 4.534814447667591e-06, + "loss": 0.3305, + "step": 370 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 2.14119815826416, + "learning_rate": 4.532371513757564e-06, + "loss": 0.4912, + "step": 371 + }, + { + "epoch": 1.9893048128342246, + "grad_norm": 2.589812994003296, + "learning_rate": 4.529922843710354e-06, + "loss": 0.611, + "step": 372 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 2.771989345550537, + "learning_rate": 4.52746844443705e-06, + "loss": 0.6487, + "step": 373 + }, + { + "epoch": 2.0, + "grad_norm": 2.7459375858306885, + "learning_rate": 4.525008322864917e-06, + "loss": 0.607, + "step": 374 + } + ], + "logging_steps": 1, + "max_steps": 1870, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.940141544046592e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_combined/trainer_log.jsonl b/metallama3_8b/limo_filtered_combined/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2743dae8b346b54055f6f24552031f52db4b035 --- /dev/null +++ b/metallama3_8b/limo_filtered_combined/trainer_log.jsonl @@ -0,0 +1,480 @@ +{"current_steps": 1, "total_steps": 1870, "loss": 0.9394, "lr": 5e-06, "epoch": 0.0053475935828877, "percentage": 0.05, "elapsed_time": "0:00:02", "remaining_time": "1:15:32"} +{"current_steps": 2, "total_steps": 1870, "loss": 2.0122, "lr": 4.99999647201733e-06, "epoch": 0.0106951871657754, "percentage": 0.11, "elapsed_time": "0:00:05", "remaining_time": "1:28:59"} +{"current_steps": 3, "total_steps": 1870, "loss": 1.0092, "lr": 4.999985888079276e-06, "epoch": 0.016042780748663103, "percentage": 0.16, "elapsed_time": "0:00:07", "remaining_time": "1:14:30"} +{"current_steps": 4, "total_steps": 1870, "loss": 1.5196, "lr": 4.999968248215712e-06, "epoch": 0.0213903743315508, "percentage": 0.21, "elapsed_time": "0:00:12", "remaining_time": "1:33:26"} +{"current_steps": 5, "total_steps": 1870, "loss": 1.4586, "lr": 4.999943552476422e-06, "epoch": 0.026737967914438502, "percentage": 0.27, "elapsed_time": "0:00:15", "remaining_time": "1:37:56"} +{"current_steps": 6, "total_steps": 1870, "loss": 1.1068, "lr": 4.999911800931108e-06, "epoch": 0.03208556149732621, "percentage": 0.32, "elapsed_time": "0:00:17", "remaining_time": "1:32:10"} +{"current_steps": 7, "total_steps": 1870, "loss": 0.8997, "lr": 4.999872993669387e-06, "epoch": 0.0374331550802139, "percentage": 0.37, "elapsed_time": "0:00:19", "remaining_time": "1:25:14"} +{"current_steps": 8, "total_steps": 1870, "loss": 1.075, "lr": 4.999827130800785e-06, "epoch": 0.0427807486631016, "percentage": 0.43, "elapsed_time": "0:00:21", "remaining_time": "1:23:32"} +{"current_steps": 9, "total_steps": 1870, "loss": 1.691, "lr": 4.999774212454746e-06, "epoch": 0.0481283422459893, "percentage": 0.48, "elapsed_time": "0:00:24", "remaining_time": "1:25:50"} +{"current_steps": 10, "total_steps": 1870, "loss": 1.3167, "lr": 4.999714238780626e-06, "epoch": 0.053475935828877004, "percentage": 0.53, "elapsed_time": "0:00:32", "remaining_time": "1:39:43"} +{"current_steps": 11, "total_steps": 1870, "loss": 0.9653, "lr": 4.999647209947694e-06, "epoch": 0.058823529411764705, "percentage": 0.59, "elapsed_time": "0:00:35", "remaining_time": "1:39:46"} +{"current_steps": 12, "total_steps": 1870, "loss": 1.2992, "lr": 4.999573126145132e-06, "epoch": 0.06417112299465241, "percentage": 0.64, "elapsed_time": "0:00:38", "remaining_time": "1:40:28"} +{"current_steps": 13, "total_steps": 1870, "loss": 0.9204, "lr": 4.999491987582032e-06, "epoch": 0.06951871657754011, "percentage": 0.7, "elapsed_time": "0:00:43", "remaining_time": "1:42:58"} +{"current_steps": 14, "total_steps": 1870, "loss": 1.3307, "lr": 4.999403794487399e-06, "epoch": 0.0748663101604278, "percentage": 0.75, "elapsed_time": "0:00:48", "remaining_time": "1:48:07"} +{"current_steps": 15, "total_steps": 1870, "loss": 0.8596, "lr": 4.999308547110147e-06, "epoch": 0.08021390374331551, "percentage": 0.8, "elapsed_time": "0:00:50", "remaining_time": "1:45:02"} +{"current_steps": 16, "total_steps": 1870, "loss": 0.9614, "lr": 4.9992062457191005e-06, "epoch": 0.0855614973262032, "percentage": 0.86, "elapsed_time": "0:00:53", "remaining_time": "1:44:08"} +{"current_steps": 17, "total_steps": 1870, "loss": 0.8, "lr": 4.999096890602996e-06, "epoch": 0.09090909090909091, "percentage": 0.91, "elapsed_time": "0:00:57", "remaining_time": "1:45:20"} +{"current_steps": 18, "total_steps": 1870, "loss": 0.7683, "lr": 4.998980482070473e-06, "epoch": 0.0962566844919786, "percentage": 0.96, "elapsed_time": "0:01:01", "remaining_time": "1:45:59"} +{"current_steps": 19, "total_steps": 1870, "loss": 1.2742, "lr": 4.998857020450084e-06, "epoch": 0.10160427807486631, "percentage": 1.02, "elapsed_time": "0:01:04", "remaining_time": "1:45:01"} +{"current_steps": 20, "total_steps": 1870, "loss": 0.8559, "lr": 4.998726506090283e-06, "epoch": 0.10695187165775401, "percentage": 1.07, "elapsed_time": "0:01:06", "remaining_time": "1:43:14"} +{"current_steps": 21, "total_steps": 1870, "loss": 0.8223, "lr": 4.998588939359435e-06, "epoch": 0.11229946524064172, "percentage": 1.12, "elapsed_time": "0:01:08", "remaining_time": "1:40:41"} +{"current_steps": 22, "total_steps": 1870, "loss": 1.1229, "lr": 4.998444320645803e-06, "epoch": 0.11764705882352941, "percentage": 1.18, "elapsed_time": "0:01:14", "remaining_time": "1:44:29"} +{"current_steps": 23, "total_steps": 1870, "loss": 0.8936, "lr": 4.998292650357558e-06, "epoch": 0.12299465240641712, "percentage": 1.23, "elapsed_time": "0:01:18", "remaining_time": "1:44:27"} +{"current_steps": 24, "total_steps": 1870, "loss": 1.2552, "lr": 4.998133928922773e-06, "epoch": 0.12834224598930483, "percentage": 1.28, "elapsed_time": "0:01:24", "remaining_time": "1:48:01"} +{"current_steps": 25, "total_steps": 1870, "loss": 0.7075, "lr": 4.99796815678942e-06, "epoch": 0.13368983957219252, "percentage": 1.34, "elapsed_time": "0:01:27", "remaining_time": "1:47:26"} +{"current_steps": 26, "total_steps": 1870, "loss": 0.9781, "lr": 4.997795334425372e-06, "epoch": 0.13903743315508021, "percentage": 1.39, "elapsed_time": "0:01:33", "remaining_time": "1:50:57"} +{"current_steps": 27, "total_steps": 1870, "loss": 1.0657, "lr": 4.997615462318403e-06, "epoch": 0.1443850267379679, "percentage": 1.44, "elapsed_time": "0:01:40", "remaining_time": "1:54:43"} +{"current_steps": 28, "total_steps": 1870, "loss": 0.951, "lr": 4.997428540976177e-06, "epoch": 0.1497326203208556, "percentage": 1.5, "elapsed_time": "0:01:45", "remaining_time": "1:55:17"} +{"current_steps": 29, "total_steps": 1870, "loss": 0.6788, "lr": 4.997234570926263e-06, "epoch": 0.15508021390374332, "percentage": 1.55, "elapsed_time": "0:01:48", "remaining_time": "1:54:49"} +{"current_steps": 30, "total_steps": 1870, "loss": 0.8, "lr": 4.997033552716116e-06, "epoch": 0.16042780748663102, "percentage": 1.6, "elapsed_time": "0:01:50", "remaining_time": "1:52:46"} +{"current_steps": 31, "total_steps": 1870, "loss": 0.7625, "lr": 4.9968254869130885e-06, "epoch": 0.1657754010695187, "percentage": 1.66, "elapsed_time": "0:01:52", "remaining_time": "1:51:09"} +{"current_steps": 32, "total_steps": 1870, "loss": 0.7381, "lr": 4.996610374104422e-06, "epoch": 0.1711229946524064, "percentage": 1.71, "elapsed_time": "0:01:53", "remaining_time": "1:48:19"} +{"current_steps": 33, "total_steps": 1870, "loss": 1.3283, "lr": 4.9963882148972475e-06, "epoch": 0.17647058823529413, "percentage": 1.76, "elapsed_time": "0:01:54", "remaining_time": "1:46:20"} +{"current_steps": 34, "total_steps": 1870, "loss": 1.0002, "lr": 4.996159009918586e-06, "epoch": 0.18181818181818182, "percentage": 1.82, "elapsed_time": "0:01:55", "remaining_time": "1:44:00"} +{"current_steps": 35, "total_steps": 1870, "loss": 0.9095, "lr": 4.9959227598153395e-06, "epoch": 0.18716577540106952, "percentage": 1.87, "elapsed_time": "0:01:59", "remaining_time": "1:44:51"} +{"current_steps": 36, "total_steps": 1870, "loss": 1.2191, "lr": 4.9956794652542994e-06, "epoch": 0.1925133689839572, "percentage": 1.93, "elapsed_time": "0:02:03", "remaining_time": "1:44:39"} +{"current_steps": 37, "total_steps": 1870, "loss": 0.7424, "lr": 4.9954291269221364e-06, "epoch": 0.19786096256684493, "percentage": 1.98, "elapsed_time": "0:02:06", "remaining_time": "1:44:04"} +{"current_steps": 38, "total_steps": 1870, "loss": 0.9289, "lr": 4.995171745525401e-06, "epoch": 0.20320855614973263, "percentage": 2.03, "elapsed_time": "0:02:09", "remaining_time": "1:44:06"} +{"current_steps": 39, "total_steps": 1870, "loss": 0.991, "lr": 4.994907321790524e-06, "epoch": 0.20855614973262032, "percentage": 2.09, "elapsed_time": "0:02:16", "remaining_time": "1:47:11"} +{"current_steps": 40, "total_steps": 1870, "loss": 0.6406, "lr": 4.994635856463811e-06, "epoch": 0.21390374331550802, "percentage": 2.14, "elapsed_time": "0:02:20", "remaining_time": "1:47:07"} +{"current_steps": 41, "total_steps": 1870, "loss": 1.2038, "lr": 4.994357350311441e-06, "epoch": 0.2192513368983957, "percentage": 2.19, "elapsed_time": "0:02:26", "remaining_time": "1:49:16"} +{"current_steps": 42, "total_steps": 1870, "loss": 0.9696, "lr": 4.994071804119467e-06, "epoch": 0.22459893048128343, "percentage": 2.25, "elapsed_time": "0:02:29", "remaining_time": "1:48:48"} +{"current_steps": 43, "total_steps": 1870, "loss": 1.1579, "lr": 4.993779218693811e-06, "epoch": 0.22994652406417113, "percentage": 2.3, "elapsed_time": "0:02:33", "remaining_time": "1:48:32"} +{"current_steps": 44, "total_steps": 1870, "loss": 0.7118, "lr": 4.99347959486026e-06, "epoch": 0.23529411764705882, "percentage": 2.35, "elapsed_time": "0:02:37", "remaining_time": "1:48:48"} +{"current_steps": 45, "total_steps": 1870, "loss": 0.7579, "lr": 4.99317293346447e-06, "epoch": 0.24064171122994651, "percentage": 2.41, "elapsed_time": "0:02:39", "remaining_time": "1:48:07"} +{"current_steps": 46, "total_steps": 1870, "loss": 0.7105, "lr": 4.992859235371958e-06, "epoch": 0.24598930481283424, "percentage": 2.46, "elapsed_time": "0:02:42", "remaining_time": "1:47:04"} +{"current_steps": 47, "total_steps": 1870, "loss": 0.6812, "lr": 4.992538501468101e-06, "epoch": 0.25133689839572193, "percentage": 2.51, "elapsed_time": "0:02:45", "remaining_time": "1:46:54"} +{"current_steps": 48, "total_steps": 1870, "loss": 0.9733, "lr": 4.992210732658132e-06, "epoch": 0.25668449197860965, "percentage": 2.57, "elapsed_time": "0:02:52", "remaining_time": "1:49:21"} +{"current_steps": 49, "total_steps": 1870, "loss": 1.1301, "lr": 4.991875929867143e-06, "epoch": 0.2620320855614973, "percentage": 2.62, "elapsed_time": "0:02:59", "remaining_time": "1:50:52"} +{"current_steps": 50, "total_steps": 1870, "loss": 0.6706, "lr": 4.991534094040077e-06, "epoch": 0.26737967914438504, "percentage": 2.67, "elapsed_time": "0:03:02", "remaining_time": "1:50:51"} +{"current_steps": 51, "total_steps": 1870, "loss": 0.9462, "lr": 4.991185226141726e-06, "epoch": 0.2727272727272727, "percentage": 2.73, "elapsed_time": "0:03:03", "remaining_time": "1:49:17"} +{"current_steps": 52, "total_steps": 1870, "loss": 1.0714, "lr": 4.990829327156729e-06, "epoch": 0.27807486631016043, "percentage": 2.78, "elapsed_time": "0:03:06", "remaining_time": "1:48:35"} +{"current_steps": 53, "total_steps": 1870, "loss": 0.9175, "lr": 4.990466398089571e-06, "epoch": 0.28342245989304815, "percentage": 2.83, "elapsed_time": "0:03:07", "remaining_time": "1:47:14"} +{"current_steps": 54, "total_steps": 1870, "loss": 0.5164, "lr": 4.99009643996458e-06, "epoch": 0.2887700534759358, "percentage": 2.89, "elapsed_time": "0:03:10", "remaining_time": "1:46:35"} +{"current_steps": 55, "total_steps": 1870, "loss": 0.7223, "lr": 4.989719453825918e-06, "epoch": 0.29411764705882354, "percentage": 2.94, "elapsed_time": "0:03:12", "remaining_time": "1:45:36"} +{"current_steps": 56, "total_steps": 1870, "loss": 0.7065, "lr": 4.989335440737587e-06, "epoch": 0.2994652406417112, "percentage": 2.99, "elapsed_time": "0:03:13", "remaining_time": "1:44:33"} +{"current_steps": 57, "total_steps": 1870, "loss": 0.8833, "lr": 4.9889444017834185e-06, "epoch": 0.3048128342245989, "percentage": 3.05, "elapsed_time": "0:03:15", "remaining_time": "1:43:41"} +{"current_steps": 58, "total_steps": 1870, "loss": 0.8664, "lr": 4.988546338067078e-06, "epoch": 0.31016042780748665, "percentage": 3.1, "elapsed_time": "0:03:18", "remaining_time": "1:43:15"} +{"current_steps": 59, "total_steps": 1870, "loss": 0.884, "lr": 4.988141250712053e-06, "epoch": 0.3155080213903743, "percentage": 3.16, "elapsed_time": "0:03:23", "remaining_time": "1:44:06"} +{"current_steps": 60, "total_steps": 1870, "loss": 0.9891, "lr": 4.987729140861657e-06, "epoch": 0.32085561497326204, "percentage": 3.21, "elapsed_time": "0:03:24", "remaining_time": "1:43:02"} +{"current_steps": 61, "total_steps": 1870, "loss": 0.8838, "lr": 4.987310009679023e-06, "epoch": 0.32620320855614976, "percentage": 3.26, "elapsed_time": "0:03:27", "remaining_time": "1:42:22"} +{"current_steps": 62, "total_steps": 1870, "loss": 0.8188, "lr": 4.986883858347101e-06, "epoch": 0.3315508021390374, "percentage": 3.32, "elapsed_time": "0:03:29", "remaining_time": "1:41:36"} +{"current_steps": 63, "total_steps": 1870, "loss": 0.6032, "lr": 4.986450688068655e-06, "epoch": 0.33689839572192515, "percentage": 3.37, "elapsed_time": "0:03:31", "remaining_time": "1:41:03"} +{"current_steps": 64, "total_steps": 1870, "loss": 0.7623, "lr": 4.986010500066258e-06, "epoch": 0.3422459893048128, "percentage": 3.42, "elapsed_time": "0:03:34", "remaining_time": "1:41:03"} +{"current_steps": 65, "total_steps": 1870, "loss": 0.8051, "lr": 4.985563295582292e-06, "epoch": 0.34759358288770054, "percentage": 3.48, "elapsed_time": "0:03:36", "remaining_time": "1:40:17"} +{"current_steps": 66, "total_steps": 1870, "loss": 0.7901, "lr": 4.98510907587894e-06, "epoch": 0.35294117647058826, "percentage": 3.53, "elapsed_time": "0:03:38", "remaining_time": "1:39:37"} +{"current_steps": 67, "total_steps": 1870, "loss": 1.0582, "lr": 4.984647842238185e-06, "epoch": 0.3582887700534759, "percentage": 3.58, "elapsed_time": "0:03:45", "remaining_time": "1:40:58"} +{"current_steps": 68, "total_steps": 1870, "loss": 0.5912, "lr": 4.984179595961806e-06, "epoch": 0.36363636363636365, "percentage": 3.64, "elapsed_time": "0:03:48", "remaining_time": "1:40:51"} +{"current_steps": 69, "total_steps": 1870, "loss": 0.7855, "lr": 4.983704338371375e-06, "epoch": 0.3689839572192513, "percentage": 3.69, "elapsed_time": "0:03:49", "remaining_time": "1:40:01"} +{"current_steps": 70, "total_steps": 1870, "loss": 0.6491, "lr": 4.983222070808255e-06, "epoch": 0.37433155080213903, "percentage": 3.74, "elapsed_time": "0:03:52", "remaining_time": "1:39:34"} +{"current_steps": 71, "total_steps": 1870, "loss": 0.9735, "lr": 4.982732794633588e-06, "epoch": 0.37967914438502676, "percentage": 3.8, "elapsed_time": "0:03:54", "remaining_time": "1:39:03"} +{"current_steps": 72, "total_steps": 1870, "loss": 0.8495, "lr": 4.982236511228301e-06, "epoch": 0.3850267379679144, "percentage": 3.85, "elapsed_time": "0:03:55", "remaining_time": "1:38:02"} +{"current_steps": 73, "total_steps": 1870, "loss": 1.0891, "lr": 4.981733221993099e-06, "epoch": 0.39037433155080214, "percentage": 3.9, "elapsed_time": "0:04:01", "remaining_time": "1:39:00"} +{"current_steps": 74, "total_steps": 1870, "loss": 0.8013, "lr": 4.981222928348456e-06, "epoch": 0.39572192513368987, "percentage": 3.96, "elapsed_time": "0:04:04", "remaining_time": "1:38:56"} +{"current_steps": 75, "total_steps": 1870, "loss": 0.8298, "lr": 4.98070563173462e-06, "epoch": 0.40106951871657753, "percentage": 4.01, "elapsed_time": "0:04:09", "remaining_time": "1:39:41"} +{"current_steps": 76, "total_steps": 1870, "loss": 0.6989, "lr": 4.980181333611601e-06, "epoch": 0.40641711229946526, "percentage": 4.06, "elapsed_time": "0:04:13", "remaining_time": "1:39:54"} +{"current_steps": 77, "total_steps": 1870, "loss": 0.6769, "lr": 4.979650035459171e-06, "epoch": 0.4117647058823529, "percentage": 4.12, "elapsed_time": "0:04:16", "remaining_time": "1:39:36"} +{"current_steps": 78, "total_steps": 1870, "loss": 1.0385, "lr": 4.9791117387768575e-06, "epoch": 0.41711229946524064, "percentage": 4.17, "elapsed_time": "0:04:20", "remaining_time": "1:39:34"} +{"current_steps": 79, "total_steps": 1870, "loss": 0.6498, "lr": 4.978566445083942e-06, "epoch": 0.42245989304812837, "percentage": 4.22, "elapsed_time": "0:04:22", "remaining_time": "1:39:10"} +{"current_steps": 80, "total_steps": 1870, "loss": 0.7931, "lr": 4.978014155919455e-06, "epoch": 0.42780748663101603, "percentage": 4.28, "elapsed_time": "0:04:23", "remaining_time": "1:38:13"} +{"current_steps": 81, "total_steps": 1870, "loss": 0.7322, "lr": 4.977454872842169e-06, "epoch": 0.43315508021390375, "percentage": 4.33, "elapsed_time": "0:04:26", "remaining_time": "1:37:58"} +{"current_steps": 82, "total_steps": 1870, "loss": 0.9184, "lr": 4.976888597430597e-06, "epoch": 0.4385026737967914, "percentage": 4.39, "elapsed_time": "0:04:30", "remaining_time": "1:38:16"} +{"current_steps": 83, "total_steps": 1870, "loss": 0.8258, "lr": 4.976315331282985e-06, "epoch": 0.44385026737967914, "percentage": 4.44, "elapsed_time": "0:04:34", "remaining_time": "1:38:29"} +{"current_steps": 84, "total_steps": 1870, "loss": 0.7414, "lr": 4.9757350760173144e-06, "epoch": 0.44919786096256686, "percentage": 4.49, "elapsed_time": "0:04:37", "remaining_time": "1:38:21"} +{"current_steps": 85, "total_steps": 1870, "loss": 0.8573, "lr": 4.975147833271288e-06, "epoch": 0.45454545454545453, "percentage": 4.55, "elapsed_time": "0:04:41", "remaining_time": "1:38:31"} +{"current_steps": 86, "total_steps": 1870, "loss": 0.7271, "lr": 4.974553604702332e-06, "epoch": 0.45989304812834225, "percentage": 4.6, "elapsed_time": "0:04:44", "remaining_time": "1:38:23"} +{"current_steps": 87, "total_steps": 1870, "loss": 0.8976, "lr": 4.973952391987589e-06, "epoch": 0.46524064171123, "percentage": 4.65, "elapsed_time": "0:04:45", "remaining_time": "1:37:36"} +{"current_steps": 88, "total_steps": 1870, "loss": 1.0753, "lr": 4.9733441968239125e-06, "epoch": 0.47058823529411764, "percentage": 4.71, "elapsed_time": "0:04:47", "remaining_time": "1:37:08"} +{"current_steps": 89, "total_steps": 1870, "loss": 0.6903, "lr": 4.972729020927866e-06, "epoch": 0.47593582887700536, "percentage": 4.76, "elapsed_time": "0:04:50", "remaining_time": "1:36:45"} +{"current_steps": 90, "total_steps": 1870, "loss": 0.9347, "lr": 4.97210686603571e-06, "epoch": 0.48128342245989303, "percentage": 4.81, "elapsed_time": "0:04:51", "remaining_time": "1:36:10"} +{"current_steps": 91, "total_steps": 1870, "loss": 0.6738, "lr": 4.97147773390341e-06, "epoch": 0.48663101604278075, "percentage": 4.87, "elapsed_time": "0:04:55", "remaining_time": "1:36:12"} +{"current_steps": 92, "total_steps": 1870, "loss": 0.8356, "lr": 4.970841626306617e-06, "epoch": 0.4919786096256685, "percentage": 4.92, "elapsed_time": "0:05:01", "remaining_time": "1:37:03"} +{"current_steps": 93, "total_steps": 1870, "loss": 0.9117, "lr": 4.970198545040673e-06, "epoch": 0.49732620320855614, "percentage": 4.97, "elapsed_time": "0:05:07", "remaining_time": "1:38:03"} +{"current_steps": 94, "total_steps": 1870, "loss": 0.8237, "lr": 4.969548491920603e-06, "epoch": 0.5026737967914439, "percentage": 5.03, "elapsed_time": "0:05:09", "remaining_time": "1:37:35"} +{"current_steps": 95, "total_steps": 1870, "loss": 0.8775, "lr": 4.968891468781105e-06, "epoch": 0.5080213903743316, "percentage": 5.08, "elapsed_time": "0:05:13", "remaining_time": "1:37:35"} +{"current_steps": 96, "total_steps": 1870, "loss": 0.9068, "lr": 4.968227477476554e-06, "epoch": 0.5133689839572193, "percentage": 5.13, "elapsed_time": "0:05:14", "remaining_time": "1:36:51"} +{"current_steps": 97, "total_steps": 1870, "loss": 1.0435, "lr": 4.9675565198809905e-06, "epoch": 0.5187165775401069, "percentage": 5.19, "elapsed_time": "0:05:19", "remaining_time": "1:37:15"} +{"current_steps": 98, "total_steps": 1870, "loss": 1.0084, "lr": 4.966878597888114e-06, "epoch": 0.5240641711229946, "percentage": 5.24, "elapsed_time": "0:05:22", "remaining_time": "1:37:12"} +{"current_steps": 99, "total_steps": 1870, "loss": 0.7217, "lr": 4.966193713411284e-06, "epoch": 0.5294117647058824, "percentage": 5.29, "elapsed_time": "0:05:24", "remaining_time": "1:36:44"} +{"current_steps": 100, "total_steps": 1870, "loss": 0.6594, "lr": 4.965501868383507e-06, "epoch": 0.5347593582887701, "percentage": 5.35, "elapsed_time": "0:05:28", "remaining_time": "1:36:51"} +{"current_steps": 101, "total_steps": 1870, "loss": 0.9249, "lr": 4.964803064757438e-06, "epoch": 0.5401069518716578, "percentage": 5.4, "elapsed_time": "0:05:35", "remaining_time": "1:37:54"} +{"current_steps": 102, "total_steps": 1870, "loss": 0.7776, "lr": 4.964097304505371e-06, "epoch": 0.5454545454545454, "percentage": 5.45, "elapsed_time": "0:05:38", "remaining_time": "1:37:51"} +{"current_steps": 103, "total_steps": 1870, "loss": 0.6339, "lr": 4.963384589619233e-06, "epoch": 0.5508021390374331, "percentage": 5.51, "elapsed_time": "0:05:41", "remaining_time": "1:37:32"} +{"current_steps": 104, "total_steps": 1870, "loss": 1.0107, "lr": 4.962664922110581e-06, "epoch": 0.5561497326203209, "percentage": 5.56, "elapsed_time": "0:05:42", "remaining_time": "1:36:47"} +{"current_steps": 105, "total_steps": 1870, "loss": 1.0052, "lr": 4.9619383040105954e-06, "epoch": 0.5614973262032086, "percentage": 5.61, "elapsed_time": "0:05:43", "remaining_time": "1:36:14"} +{"current_steps": 106, "total_steps": 1870, "loss": 0.8577, "lr": 4.961204737370071e-06, "epoch": 0.5668449197860963, "percentage": 5.67, "elapsed_time": "0:05:47", "remaining_time": "1:36:24"} +{"current_steps": 107, "total_steps": 1870, "loss": 1.1237, "lr": 4.960464224259418e-06, "epoch": 0.5721925133689839, "percentage": 5.72, "elapsed_time": "0:05:50", "remaining_time": "1:36:17"} +{"current_steps": 108, "total_steps": 1870, "loss": 0.6237, "lr": 4.95971676676865e-06, "epoch": 0.5775401069518716, "percentage": 5.78, "elapsed_time": "0:05:52", "remaining_time": "1:35:57"} +{"current_steps": 109, "total_steps": 1870, "loss": 0.9135, "lr": 4.958962367007381e-06, "epoch": 0.5828877005347594, "percentage": 5.83, "elapsed_time": "0:05:54", "remaining_time": "1:35:28"} +{"current_steps": 110, "total_steps": 1870, "loss": 0.7461, "lr": 4.958201027104818e-06, "epoch": 0.5882352941176471, "percentage": 5.88, "elapsed_time": "0:05:57", "remaining_time": "1:35:19"} +{"current_steps": 111, "total_steps": 1870, "loss": 0.69, "lr": 4.957432749209755e-06, "epoch": 0.5935828877005348, "percentage": 5.94, "elapsed_time": "0:06:03", "remaining_time": "1:36:00"} +{"current_steps": 112, "total_steps": 1870, "loss": 0.8578, "lr": 4.95665753549057e-06, "epoch": 0.5989304812834224, "percentage": 5.99, "elapsed_time": "0:06:07", "remaining_time": "1:36:14"} +{"current_steps": 113, "total_steps": 1870, "loss": 1.3098, "lr": 4.9558753881352165e-06, "epoch": 0.6042780748663101, "percentage": 6.04, "elapsed_time": "0:06:10", "remaining_time": "1:35:58"} +{"current_steps": 114, "total_steps": 1870, "loss": 0.9979, "lr": 4.955086309351213e-06, "epoch": 0.6096256684491979, "percentage": 6.1, "elapsed_time": "0:06:13", "remaining_time": "1:35:46"} +{"current_steps": 115, "total_steps": 1870, "loss": 0.6298, "lr": 4.9542903013656485e-06, "epoch": 0.6149732620320856, "percentage": 6.15, "elapsed_time": "0:06:14", "remaining_time": "1:35:15"} +{"current_steps": 116, "total_steps": 1870, "loss": 0.959, "lr": 4.953487366425163e-06, "epoch": 0.6203208556149733, "percentage": 6.2, "elapsed_time": "0:06:18", "remaining_time": "1:35:27"} +{"current_steps": 117, "total_steps": 1870, "loss": 0.6791, "lr": 4.952677506795949e-06, "epoch": 0.6256684491978609, "percentage": 6.26, "elapsed_time": "0:06:20", "remaining_time": "1:34:58"} +{"current_steps": 118, "total_steps": 1870, "loss": 0.7783, "lr": 4.951860724763743e-06, "epoch": 0.6310160427807486, "percentage": 6.31, "elapsed_time": "0:06:23", "remaining_time": "1:34:49"} +{"current_steps": 119, "total_steps": 1870, "loss": 0.8085, "lr": 4.95103702263382e-06, "epoch": 0.6363636363636364, "percentage": 6.36, "elapsed_time": "0:06:25", "remaining_time": "1:34:28"} +{"current_steps": 120, "total_steps": 1870, "loss": 0.7702, "lr": 4.950206402730984e-06, "epoch": 0.6417112299465241, "percentage": 6.42, "elapsed_time": "0:06:28", "remaining_time": "1:34:32"} +{"current_steps": 121, "total_steps": 1870, "loss": 0.602, "lr": 4.949368867399567e-06, "epoch": 0.6470588235294118, "percentage": 6.47, "elapsed_time": "0:06:30", "remaining_time": "1:33:59"} +{"current_steps": 122, "total_steps": 1870, "loss": 1.2858, "lr": 4.948524419003415e-06, "epoch": 0.6524064171122995, "percentage": 6.52, "elapsed_time": "0:06:32", "remaining_time": "1:33:49"} +{"current_steps": 123, "total_steps": 1870, "loss": 0.7945, "lr": 4.947673059925889e-06, "epoch": 0.6577540106951871, "percentage": 6.58, "elapsed_time": "0:06:38", "remaining_time": "1:34:17"} +{"current_steps": 124, "total_steps": 1870, "loss": 0.959, "lr": 4.9468147925698525e-06, "epoch": 0.6631016042780749, "percentage": 6.63, "elapsed_time": "0:06:41", "remaining_time": "1:34:14"} +{"current_steps": 125, "total_steps": 1870, "loss": 0.7611, "lr": 4.945949619357668e-06, "epoch": 0.6684491978609626, "percentage": 6.68, "elapsed_time": "0:06:42", "remaining_time": "1:33:41"} +{"current_steps": 126, "total_steps": 1870, "loss": 0.5753, "lr": 4.945077542731188e-06, "epoch": 0.6737967914438503, "percentage": 6.74, "elapsed_time": "0:06:44", "remaining_time": "1:33:16"} +{"current_steps": 127, "total_steps": 1870, "loss": 0.8995, "lr": 4.94419856515175e-06, "epoch": 0.679144385026738, "percentage": 6.79, "elapsed_time": "0:06:49", "remaining_time": "1:33:43"} +{"current_steps": 128, "total_steps": 1870, "loss": 0.9623, "lr": 4.943312689100166e-06, "epoch": 0.6844919786096256, "percentage": 6.84, "elapsed_time": "0:06:51", "remaining_time": "1:33:20"} +{"current_steps": 129, "total_steps": 1870, "loss": 0.6657, "lr": 4.942419917076723e-06, "epoch": 0.6898395721925134, "percentage": 6.9, "elapsed_time": "0:06:55", "remaining_time": "1:33:21"} +{"current_steps": 130, "total_steps": 1870, "loss": 0.7711, "lr": 4.941520251601167e-06, "epoch": 0.6951871657754011, "percentage": 6.95, "elapsed_time": "0:06:57", "remaining_time": "1:33:05"} +{"current_steps": 131, "total_steps": 1870, "loss": 0.5908, "lr": 4.940613695212702e-06, "epoch": 0.7005347593582888, "percentage": 7.01, "elapsed_time": "0:06:58", "remaining_time": "1:32:34"} +{"current_steps": 132, "total_steps": 1870, "loss": 0.967, "lr": 4.939700250469979e-06, "epoch": 0.7058823529411765, "percentage": 7.06, "elapsed_time": "0:07:02", "remaining_time": "1:32:46"} +{"current_steps": 133, "total_steps": 1870, "loss": 0.9519, "lr": 4.938779919951092e-06, "epoch": 0.7112299465240641, "percentage": 7.11, "elapsed_time": "0:07:04", "remaining_time": "1:32:18"} +{"current_steps": 134, "total_steps": 1870, "loss": 0.5873, "lr": 4.93785270625357e-06, "epoch": 0.7165775401069518, "percentage": 7.17, "elapsed_time": "0:07:05", "remaining_time": "1:31:54"} +{"current_steps": 135, "total_steps": 1870, "loss": 0.8148, "lr": 4.936918611994368e-06, "epoch": 0.7219251336898396, "percentage": 7.22, "elapsed_time": "0:07:12", "remaining_time": "1:32:34"} +{"current_steps": 136, "total_steps": 1870, "loss": 0.8286, "lr": 4.935977639809861e-06, "epoch": 0.7272727272727273, "percentage": 7.27, "elapsed_time": "0:07:15", "remaining_time": "1:32:27"} +{"current_steps": 137, "total_steps": 1870, "loss": 0.6442, "lr": 4.935029792355834e-06, "epoch": 0.732620320855615, "percentage": 7.33, "elapsed_time": "0:07:17", "remaining_time": "1:32:13"} +{"current_steps": 138, "total_steps": 1870, "loss": 1.0144, "lr": 4.934075072307481e-06, "epoch": 0.7379679144385026, "percentage": 7.38, "elapsed_time": "0:07:20", "remaining_time": "1:32:07"} +{"current_steps": 139, "total_steps": 1870, "loss": 0.5922, "lr": 4.933113482359388e-06, "epoch": 0.7433155080213903, "percentage": 7.43, "elapsed_time": "0:07:22", "remaining_time": "1:31:45"} +{"current_steps": 140, "total_steps": 1870, "loss": 0.7546, "lr": 4.932145025225535e-06, "epoch": 0.7486631016042781, "percentage": 7.49, "elapsed_time": "0:07:23", "remaining_time": "1:31:17"} +{"current_steps": 141, "total_steps": 1870, "loss": 0.8797, "lr": 4.931169703639282e-06, "epoch": 0.7540106951871658, "percentage": 7.54, "elapsed_time": "0:07:25", "remaining_time": "1:31:02"} +{"current_steps": 142, "total_steps": 1870, "loss": 0.865, "lr": 4.930187520353363e-06, "epoch": 0.7593582887700535, "percentage": 7.59, "elapsed_time": "0:07:28", "remaining_time": "1:31:00"} +{"current_steps": 143, "total_steps": 1870, "loss": 0.6901, "lr": 4.929198478139877e-06, "epoch": 0.7647058823529411, "percentage": 7.65, "elapsed_time": "0:07:31", "remaining_time": "1:30:47"} +{"current_steps": 144, "total_steps": 1870, "loss": 0.5932, "lr": 4.928202579790285e-06, "epoch": 0.7700534759358288, "percentage": 7.7, "elapsed_time": "0:07:34", "remaining_time": "1:30:48"} +{"current_steps": 145, "total_steps": 1870, "loss": 0.7742, "lr": 4.927199828115395e-06, "epoch": 0.7754010695187166, "percentage": 7.75, "elapsed_time": "0:07:36", "remaining_time": "1:30:25"} +{"current_steps": 146, "total_steps": 1870, "loss": 0.8475, "lr": 4.9261902259453616e-06, "epoch": 0.7807486631016043, "percentage": 7.81, "elapsed_time": "0:07:39", "remaining_time": "1:30:29"} +{"current_steps": 147, "total_steps": 1870, "loss": 1.0514, "lr": 4.925173776129669e-06, "epoch": 0.786096256684492, "percentage": 7.86, "elapsed_time": "0:07:45", "remaining_time": "1:30:56"} +{"current_steps": 148, "total_steps": 1870, "loss": 0.5964, "lr": 4.9241504815371346e-06, "epoch": 0.7914438502673797, "percentage": 7.91, "elapsed_time": "0:07:46", "remaining_time": "1:30:29"} +{"current_steps": 149, "total_steps": 1870, "loss": 0.7615, "lr": 4.923120345055887e-06, "epoch": 0.7967914438502673, "percentage": 7.97, "elapsed_time": "0:07:49", "remaining_time": "1:30:28"} +{"current_steps": 150, "total_steps": 1870, "loss": 0.6908, "lr": 4.922083369593372e-06, "epoch": 0.8021390374331551, "percentage": 8.02, "elapsed_time": "0:07:51", "remaining_time": "1:30:09"} +{"current_steps": 151, "total_steps": 1870, "loss": 0.8661, "lr": 4.921039558076335e-06, "epoch": 0.8074866310160428, "percentage": 8.07, "elapsed_time": "0:07:56", "remaining_time": "1:30:24"} +{"current_steps": 152, "total_steps": 1870, "loss": 0.5267, "lr": 4.919988913450812e-06, "epoch": 0.8128342245989305, "percentage": 8.13, "elapsed_time": "0:07:59", "remaining_time": "1:30:18"} +{"current_steps": 153, "total_steps": 1870, "loss": 0.9222, "lr": 4.918931438682132e-06, "epoch": 0.8181818181818182, "percentage": 8.18, "elapsed_time": "0:08:02", "remaining_time": "1:30:16"} +{"current_steps": 154, "total_steps": 1870, "loss": 0.8865, "lr": 4.917867136754894e-06, "epoch": 0.8235294117647058, "percentage": 8.24, "elapsed_time": "0:08:04", "remaining_time": "1:29:55"} +{"current_steps": 155, "total_steps": 1870, "loss": 0.7262, "lr": 4.916796010672969e-06, "epoch": 0.8288770053475936, "percentage": 8.29, "elapsed_time": "0:08:07", "remaining_time": "1:29:57"} +{"current_steps": 156, "total_steps": 1870, "loss": 0.7611, "lr": 4.91571806345949e-06, "epoch": 0.8342245989304813, "percentage": 8.34, "elapsed_time": "0:08:10", "remaining_time": "1:29:44"} +{"current_steps": 157, "total_steps": 1870, "loss": 0.8745, "lr": 4.91463329815684e-06, "epoch": 0.839572192513369, "percentage": 8.4, "elapsed_time": "0:08:11", "remaining_time": "1:29:26"} +{"current_steps": 158, "total_steps": 1870, "loss": 0.6164, "lr": 4.913541717826645e-06, "epoch": 0.8449197860962567, "percentage": 8.45, "elapsed_time": "0:08:14", "remaining_time": "1:29:19"} +{"current_steps": 159, "total_steps": 1870, "loss": 0.5549, "lr": 4.912443325549767e-06, "epoch": 0.8502673796791443, "percentage": 8.5, "elapsed_time": "0:08:18", "remaining_time": "1:29:25"} +{"current_steps": 160, "total_steps": 1870, "loss": 0.9052, "lr": 4.911338124426291e-06, "epoch": 0.8556149732620321, "percentage": 8.56, "elapsed_time": "0:08:21", "remaining_time": "1:29:23"} +{"current_steps": 161, "total_steps": 1870, "loss": 0.7989, "lr": 4.910226117575525e-06, "epoch": 0.8609625668449198, "percentage": 8.61, "elapsed_time": "0:08:26", "remaining_time": "1:29:35"} +{"current_steps": 162, "total_steps": 1870, "loss": 0.5915, "lr": 4.909107308135978e-06, "epoch": 0.8663101604278075, "percentage": 8.66, "elapsed_time": "0:08:28", "remaining_time": "1:29:22"} +{"current_steps": 163, "total_steps": 1870, "loss": 0.6593, "lr": 4.907981699265364e-06, "epoch": 0.8716577540106952, "percentage": 8.72, "elapsed_time": "0:08:30", "remaining_time": "1:29:02"} +{"current_steps": 164, "total_steps": 1870, "loss": 0.8739, "lr": 4.906849294140587e-06, "epoch": 0.8770053475935828, "percentage": 8.77, "elapsed_time": "0:08:33", "remaining_time": "1:29:00"} +{"current_steps": 165, "total_steps": 1870, "loss": 0.7314, "lr": 4.9057100959577285e-06, "epoch": 0.8823529411764706, "percentage": 8.82, "elapsed_time": "0:08:37", "remaining_time": "1:29:12"} +{"current_steps": 166, "total_steps": 1870, "loss": 1.0109, "lr": 4.904564107932048e-06, "epoch": 0.8877005347593583, "percentage": 8.88, "elapsed_time": "0:08:40", "remaining_time": "1:28:59"} +{"current_steps": 167, "total_steps": 1870, "loss": 0.9092, "lr": 4.903411333297966e-06, "epoch": 0.893048128342246, "percentage": 8.93, "elapsed_time": "0:08:43", "remaining_time": "1:28:54"} +{"current_steps": 168, "total_steps": 1870, "loss": 0.7922, "lr": 4.902251775309057e-06, "epoch": 0.8983957219251337, "percentage": 8.98, "elapsed_time": "0:08:46", "remaining_time": "1:28:53"} +{"current_steps": 169, "total_steps": 1870, "loss": 0.5955, "lr": 4.901085437238041e-06, "epoch": 0.9037433155080213, "percentage": 9.04, "elapsed_time": "0:08:52", "remaining_time": "1:29:17"} +{"current_steps": 170, "total_steps": 1870, "loss": 1.0019, "lr": 4.899912322376776e-06, "epoch": 0.9090909090909091, "percentage": 9.09, "elapsed_time": "0:08:54", "remaining_time": "1:29:01"} +{"current_steps": 171, "total_steps": 1870, "loss": 0.8508, "lr": 4.8987324340362445e-06, "epoch": 0.9144385026737968, "percentage": 9.14, "elapsed_time": "0:08:58", "remaining_time": "1:29:10"} +{"current_steps": 172, "total_steps": 1870, "loss": 0.8514, "lr": 4.897545775546545e-06, "epoch": 0.9197860962566845, "percentage": 9.2, "elapsed_time": "0:08:59", "remaining_time": "1:28:48"} +{"current_steps": 173, "total_steps": 1870, "loss": 1.0263, "lr": 4.8963523502568886e-06, "epoch": 0.9251336898395722, "percentage": 9.25, "elapsed_time": "0:09:03", "remaining_time": "1:28:46"} +{"current_steps": 174, "total_steps": 1870, "loss": 0.7929, "lr": 4.895152161535582e-06, "epoch": 0.93048128342246, "percentage": 9.3, "elapsed_time": "0:09:04", "remaining_time": "1:28:30"} +{"current_steps": 175, "total_steps": 1870, "loss": 0.7227, "lr": 4.893945212770019e-06, "epoch": 0.9358288770053476, "percentage": 9.36, "elapsed_time": "0:09:08", "remaining_time": "1:28:30"} +{"current_steps": 176, "total_steps": 1870, "loss": 0.8923, "lr": 4.892731507366678e-06, "epoch": 0.9411764705882353, "percentage": 9.41, "elapsed_time": "0:09:12", "remaining_time": "1:28:34"} +{"current_steps": 177, "total_steps": 1870, "loss": 0.7475, "lr": 4.891511048751102e-06, "epoch": 0.946524064171123, "percentage": 9.47, "elapsed_time": "0:09:15", "remaining_time": "1:28:30"} +{"current_steps": 178, "total_steps": 1870, "loss": 1.1405, "lr": 4.890283840367898e-06, "epoch": 0.9518716577540107, "percentage": 9.52, "elapsed_time": "0:09:20", "remaining_time": "1:28:51"} +{"current_steps": 179, "total_steps": 1870, "loss": 0.8524, "lr": 4.889049885680721e-06, "epoch": 0.9572192513368984, "percentage": 9.57, "elapsed_time": "0:09:24", "remaining_time": "1:28:49"} +{"current_steps": 180, "total_steps": 1870, "loss": 0.7617, "lr": 4.887809188172268e-06, "epoch": 0.9625668449197861, "percentage": 9.63, "elapsed_time": "0:09:25", "remaining_time": "1:28:32"} +{"current_steps": 181, "total_steps": 1870, "loss": 0.8514, "lr": 4.886561751344266e-06, "epoch": 0.9679144385026738, "percentage": 9.68, "elapsed_time": "0:09:27", "remaining_time": "1:28:18"} +{"current_steps": 182, "total_steps": 1870, "loss": 0.8335, "lr": 4.885307578717464e-06, "epoch": 0.9732620320855615, "percentage": 9.73, "elapsed_time": "0:09:33", "remaining_time": "1:28:38"} +{"current_steps": 183, "total_steps": 1870, "loss": 0.831, "lr": 4.8840466738316216e-06, "epoch": 0.9786096256684492, "percentage": 9.79, "elapsed_time": "0:09:37", "remaining_time": "1:28:43"} +{"current_steps": 184, "total_steps": 1870, "loss": 0.7891, "lr": 4.882779040245499e-06, "epoch": 0.983957219251337, "percentage": 9.84, "elapsed_time": "0:09:39", "remaining_time": "1:28:28"} +{"current_steps": 185, "total_steps": 1870, "loss": 0.6257, "lr": 4.881504681536847e-06, "epoch": 0.9893048128342246, "percentage": 9.89, "elapsed_time": "0:09:44", "remaining_time": "1:28:40"} +{"current_steps": 186, "total_steps": 1870, "loss": 0.6008, "lr": 4.880223601302398e-06, "epoch": 0.9946524064171123, "percentage": 9.95, "elapsed_time": "0:09:48", "remaining_time": "1:28:50"} +{"current_steps": 187, "total_steps": 1870, "loss": 0.6061, "lr": 4.878935803157856e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:09:50", "remaining_time": "1:28:35"} +{"current_steps": 188, "total_steps": 1870, "loss": 0.6628, "lr": 4.8776412907378845e-06, "epoch": 1.0053475935828877, "percentage": 10.05, "elapsed_time": "1:48:13", "remaining_time": "16:08:15"} +{"current_steps": 189, "total_steps": 1870, "loss": 0.6124, "lr": 4.876340067696097e-06, "epoch": 1.0106951871657754, "percentage": 10.11, "elapsed_time": "1:48:19", "remaining_time": "16:03:27"} +{"current_steps": 190, "total_steps": 1870, "loss": 0.7186, "lr": 4.875032137705047e-06, "epoch": 1.0160427807486632, "percentage": 10.16, "elapsed_time": "1:48:22", "remaining_time": "15:58:17"} +{"current_steps": 191, "total_steps": 1870, "loss": 0.7723, "lr": 4.873717504456219e-06, "epoch": 1.0213903743315509, "percentage": 10.21, "elapsed_time": "1:48:28", "remaining_time": "15:53:34"} +{"current_steps": 192, "total_steps": 1870, "loss": 0.4069, "lr": 4.872396171660014e-06, "epoch": 1.0267379679144386, "percentage": 10.27, "elapsed_time": "1:48:31", "remaining_time": "15:48:26"} +{"current_steps": 193, "total_steps": 1870, "loss": 0.6705, "lr": 4.8710681430457466e-06, "epoch": 1.032085561497326, "percentage": 10.32, "elapsed_time": "1:48:37", "remaining_time": "15:43:47"} +{"current_steps": 194, "total_steps": 1870, "loss": 0.8276, "lr": 4.8697334223616226e-06, "epoch": 1.0374331550802138, "percentage": 10.37, "elapsed_time": "1:48:41", "remaining_time": "15:38:56"} +{"current_steps": 195, "total_steps": 1870, "loss": 0.5913, "lr": 4.8683920133747405e-06, "epoch": 1.0427807486631016, "percentage": 10.43, "elapsed_time": "1:48:43", "remaining_time": "15:33:54"} +{"current_steps": 196, "total_steps": 1870, "loss": 0.5244, "lr": 4.867043919871076e-06, "epoch": 1.0481283422459893, "percentage": 10.48, "elapsed_time": "1:48:45", "remaining_time": "15:28:55"} +{"current_steps": 197, "total_steps": 1870, "loss": 0.5962, "lr": 4.865689145655467e-06, "epoch": 1.053475935828877, "percentage": 10.53, "elapsed_time": "1:48:51", "remaining_time": "15:24:28"} +{"current_steps": 198, "total_steps": 1870, "loss": 0.5601, "lr": 4.864327694551612e-06, "epoch": 1.0588235294117647, "percentage": 10.59, "elapsed_time": "1:48:53", "remaining_time": "15:19:33"} +{"current_steps": 199, "total_steps": 1870, "loss": 0.953, "lr": 4.86295957040205e-06, "epoch": 1.0641711229946524, "percentage": 10.64, "elapsed_time": "1:48:55", "remaining_time": "15:14:37"} +{"current_steps": 200, "total_steps": 1870, "loss": 0.7394, "lr": 4.861584777068154e-06, "epoch": 1.0695187165775402, "percentage": 10.7, "elapsed_time": "1:49:01", "remaining_time": "15:10:24"} +{"current_steps": 201, "total_steps": 1870, "loss": 0.3851, "lr": 4.860203318430126e-06, "epoch": 1.0748663101604279, "percentage": 10.75, "elapsed_time": "1:49:03", "remaining_time": "15:05:30"} +{"current_steps": 202, "total_steps": 1870, "loss": 0.6595, "lr": 4.858815198386973e-06, "epoch": 1.0802139037433156, "percentage": 10.8, "elapsed_time": "1:49:05", "remaining_time": "15:00:46"} +{"current_steps": 203, "total_steps": 1870, "loss": 0.5308, "lr": 4.8574204208565056e-06, "epoch": 1.085561497326203, "percentage": 10.86, "elapsed_time": "1:49:07", "remaining_time": "14:56:03"} +{"current_steps": 204, "total_steps": 1870, "loss": 0.5401, "lr": 4.856018989775326e-06, "epoch": 1.0909090909090908, "percentage": 10.91, "elapsed_time": "1:49:10", "remaining_time": "14:51:38"} +{"current_steps": 205, "total_steps": 1870, "loss": 0.6365, "lr": 4.854610909098813e-06, "epoch": 1.0962566844919786, "percentage": 10.96, "elapsed_time": "1:49:12", "remaining_time": "14:47:02"} +{"current_steps": 206, "total_steps": 1870, "loss": 1.0949, "lr": 4.853196182801112e-06, "epoch": 1.1016042780748663, "percentage": 11.02, "elapsed_time": "1:49:14", "remaining_time": "14:42:28"} +{"current_steps": 207, "total_steps": 1870, "loss": 0.6629, "lr": 4.851774814875131e-06, "epoch": 1.106951871657754, "percentage": 11.07, "elapsed_time": "1:49:21", "remaining_time": "14:38:32"} +{"current_steps": 208, "total_steps": 1870, "loss": 0.7166, "lr": 4.850346809332515e-06, "epoch": 1.1122994652406417, "percentage": 11.12, "elapsed_time": "1:49:23", "remaining_time": "14:34:03"} +{"current_steps": 209, "total_steps": 1870, "loss": 0.7077, "lr": 4.8489121702036515e-06, "epoch": 1.1176470588235294, "percentage": 11.18, "elapsed_time": "1:49:28", "remaining_time": "14:30:02"} +{"current_steps": 210, "total_steps": 1870, "loss": 0.6319, "lr": 4.847470901537642e-06, "epoch": 1.1229946524064172, "percentage": 11.23, "elapsed_time": "1:49:34", "remaining_time": "14:26:13"} +{"current_steps": 211, "total_steps": 1870, "loss": 0.5327, "lr": 4.846023007402305e-06, "epoch": 1.1283422459893049, "percentage": 11.28, "elapsed_time": "1:49:36", "remaining_time": "14:21:49"} +{"current_steps": 212, "total_steps": 1870, "loss": 0.414, "lr": 4.844568491884156e-06, "epoch": 1.1336898395721926, "percentage": 11.34, "elapsed_time": "1:49:38", "remaining_time": "14:17:24"} +{"current_steps": 213, "total_steps": 1870, "loss": 0.5933, "lr": 4.843107359088402e-06, "epoch": 1.1390374331550803, "percentage": 11.39, "elapsed_time": "1:49:40", "remaining_time": "14:13:09"} +{"current_steps": 214, "total_steps": 1870, "loss": 0.6844, "lr": 4.84163961313892e-06, "epoch": 1.1443850267379678, "percentage": 11.44, "elapsed_time": "1:49:44", "remaining_time": "14:09:13"} +{"current_steps": 215, "total_steps": 1870, "loss": 0.5242, "lr": 4.840165258178259e-06, "epoch": 1.1497326203208555, "percentage": 11.5, "elapsed_time": "1:49:46", "remaining_time": "14:05:03"} +{"current_steps": 216, "total_steps": 1870, "loss": 0.747, "lr": 4.838684298367616e-06, "epoch": 1.1550802139037433, "percentage": 11.55, "elapsed_time": "1:49:48", "remaining_time": "14:00:50"} +{"current_steps": 217, "total_steps": 1870, "loss": 0.7602, "lr": 4.837196737886834e-06, "epoch": 1.160427807486631, "percentage": 11.6, "elapsed_time": "1:49:51", "remaining_time": "13:56:47"} +{"current_steps": 218, "total_steps": 1870, "loss": 0.7525, "lr": 4.83570258093438e-06, "epoch": 1.1657754010695187, "percentage": 11.66, "elapsed_time": "1:49:55", "remaining_time": "13:53:00"} +{"current_steps": 219, "total_steps": 1870, "loss": 0.5111, "lr": 4.834201831727343e-06, "epoch": 1.1711229946524064, "percentage": 11.71, "elapsed_time": "1:50:02", "remaining_time": "13:49:36"} +{"current_steps": 220, "total_steps": 1870, "loss": 0.6215, "lr": 4.832694494501417e-06, "epoch": 1.1764705882352942, "percentage": 11.76, "elapsed_time": "1:50:04", "remaining_time": "13:45:32"} +{"current_steps": 221, "total_steps": 1870, "loss": 0.3931, "lr": 4.83118057351089e-06, "epoch": 1.1818181818181819, "percentage": 11.82, "elapsed_time": "1:50:05", "remaining_time": "13:41:28"} +{"current_steps": 222, "total_steps": 1870, "loss": 0.6418, "lr": 4.829660073028631e-06, "epoch": 1.1871657754010696, "percentage": 11.87, "elapsed_time": "1:50:09", "remaining_time": "13:37:41"} +{"current_steps": 223, "total_steps": 1870, "loss": 0.5028, "lr": 4.82813299734608e-06, "epoch": 1.192513368983957, "percentage": 11.93, "elapsed_time": "1:50:11", "remaining_time": "13:33:52"} +{"current_steps": 224, "total_steps": 1870, "loss": 0.4452, "lr": 4.826599350773234e-06, "epoch": 1.1978609625668448, "percentage": 11.98, "elapsed_time": "1:50:14", "remaining_time": "13:30:01"} +{"current_steps": 225, "total_steps": 1870, "loss": 0.7803, "lr": 4.825059137638636e-06, "epoch": 1.2032085561497325, "percentage": 12.03, "elapsed_time": "1:50:17", "remaining_time": "13:26:24"} +{"current_steps": 226, "total_steps": 1870, "loss": 0.5968, "lr": 4.823512362289362e-06, "epoch": 1.2085561497326203, "percentage": 12.09, "elapsed_time": "1:50:20", "remaining_time": "13:22:36"} +{"current_steps": 227, "total_steps": 1870, "loss": 0.5724, "lr": 4.821959029091009e-06, "epoch": 1.213903743315508, "percentage": 12.14, "elapsed_time": "1:50:25", "remaining_time": "13:19:13"} +{"current_steps": 228, "total_steps": 1870, "loss": 0.5357, "lr": 4.820399142427684e-06, "epoch": 1.2192513368983957, "percentage": 12.19, "elapsed_time": "1:50:27", "remaining_time": "13:15:29"} +{"current_steps": 229, "total_steps": 1870, "loss": 0.5743, "lr": 4.818832706701989e-06, "epoch": 1.2245989304812834, "percentage": 12.25, "elapsed_time": "1:50:31", "remaining_time": "13:11:58"} +{"current_steps": 230, "total_steps": 1870, "loss": 0.8447, "lr": 4.817259726335009e-06, "epoch": 1.2299465240641712, "percentage": 12.3, "elapsed_time": "1:50:32", "remaining_time": "13:08:14"} +{"current_steps": 231, "total_steps": 1870, "loss": 0.8136, "lr": 4.815680205766304e-06, "epoch": 1.2352941176470589, "percentage": 12.35, "elapsed_time": "1:50:40", "remaining_time": "13:05:13"} +{"current_steps": 232, "total_steps": 1870, "loss": 0.4073, "lr": 4.814094149453891e-06, "epoch": 1.2406417112299466, "percentage": 12.41, "elapsed_time": "1:50:42", "remaining_time": "13:01:37"} +{"current_steps": 233, "total_steps": 1870, "loss": 0.6625, "lr": 4.812501561874232e-06, "epoch": 1.2459893048128343, "percentage": 12.46, "elapsed_time": "1:50:45", "remaining_time": "12:58:08"} +{"current_steps": 234, "total_steps": 1870, "loss": 0.6403, "lr": 4.8109024475222255e-06, "epoch": 1.251336898395722, "percentage": 12.51, "elapsed_time": "1:50:49", "remaining_time": "12:54:47"} +{"current_steps": 235, "total_steps": 1870, "loss": 0.8436, "lr": 4.809296810911188e-06, "epoch": 1.2566844919786098, "percentage": 12.57, "elapsed_time": "1:50:51", "remaining_time": "12:51:15"} +{"current_steps": 236, "total_steps": 1870, "loss": 0.8578, "lr": 4.8076846565728475e-06, "epoch": 1.2620320855614973, "percentage": 12.62, "elapsed_time": "1:50:57", "remaining_time": "12:48:16"} +{"current_steps": 237, "total_steps": 1870, "loss": 0.5431, "lr": 4.806065989057326e-06, "epoch": 1.267379679144385, "percentage": 12.67, "elapsed_time": "1:51:01", "remaining_time": "12:44:58"} +{"current_steps": 238, "total_steps": 1870, "loss": 0.4613, "lr": 4.8044408129331266e-06, "epoch": 1.2727272727272727, "percentage": 12.73, "elapsed_time": "1:51:03", "remaining_time": "12:41:32"} +{"current_steps": 239, "total_steps": 1870, "loss": 0.6743, "lr": 4.802809132787125e-06, "epoch": 1.2780748663101604, "percentage": 12.78, "elapsed_time": "1:51:05", "remaining_time": "12:38:08"} +{"current_steps": 240, "total_steps": 1870, "loss": 0.8116, "lr": 4.801170953224554e-06, "epoch": 1.2834224598930482, "percentage": 12.83, "elapsed_time": "1:51:07", "remaining_time": "12:34:42"} +{"current_steps": 241, "total_steps": 1870, "loss": 0.4008, "lr": 4.7995262788689865e-06, "epoch": 1.2887700534759359, "percentage": 12.89, "elapsed_time": "1:51:11", "remaining_time": "12:31:31"} +{"current_steps": 242, "total_steps": 1870, "loss": 0.5953, "lr": 4.797875114362331e-06, "epoch": 1.2941176470588236, "percentage": 12.94, "elapsed_time": "1:51:12", "remaining_time": "12:28:07"} +{"current_steps": 243, "total_steps": 1870, "loss": 0.8779, "lr": 4.796217464364808e-06, "epoch": 1.299465240641711, "percentage": 12.99, "elapsed_time": "1:51:15", "remaining_time": "12:24:55"} +{"current_steps": 244, "total_steps": 1870, "loss": 0.7568, "lr": 4.794553333554949e-06, "epoch": 1.3048128342245988, "percentage": 13.05, "elapsed_time": "1:51:18", "remaining_time": "12:21:47"} +{"current_steps": 245, "total_steps": 1870, "loss": 0.5016, "lr": 4.792882726629572e-06, "epoch": 1.3101604278074865, "percentage": 13.1, "elapsed_time": "1:51:22", "remaining_time": "12:18:45"} +{"current_steps": 246, "total_steps": 1870, "loss": 0.8415, "lr": 4.791205648303775e-06, "epoch": 1.3155080213903743, "percentage": 13.16, "elapsed_time": "1:51:27", "remaining_time": "12:15:45"} +{"current_steps": 247, "total_steps": 1870, "loss": 0.6032, "lr": 4.789522103310922e-06, "epoch": 1.320855614973262, "percentage": 13.21, "elapsed_time": "1:51:31", "remaining_time": "12:12:50"} +{"current_steps": 248, "total_steps": 1870, "loss": 0.6548, "lr": 4.787832096402626e-06, "epoch": 1.3262032085561497, "percentage": 13.26, "elapsed_time": "1:51:36", "remaining_time": "12:09:57"} +{"current_steps": 249, "total_steps": 1870, "loss": 0.6212, "lr": 4.786135632348738e-06, "epoch": 1.3315508021390374, "percentage": 13.32, "elapsed_time": "1:51:39", "remaining_time": "12:06:56"} +{"current_steps": 250, "total_steps": 1870, "loss": 0.8052, "lr": 4.7844327159373365e-06, "epoch": 1.3368983957219251, "percentage": 13.37, "elapsed_time": "1:51:43", "remaining_time": "12:03:55"} +{"current_steps": 251, "total_steps": 1870, "loss": 0.589, "lr": 4.782723351974708e-06, "epoch": 1.3422459893048129, "percentage": 13.42, "elapsed_time": "1:51:46", "remaining_time": "12:00:57"} +{"current_steps": 252, "total_steps": 1870, "loss": 0.63, "lr": 4.7810075452853385e-06, "epoch": 1.3475935828877006, "percentage": 13.48, "elapsed_time": "1:51:49", "remaining_time": "11:57:58"} +{"current_steps": 253, "total_steps": 1870, "loss": 0.6727, "lr": 4.779285300711897e-06, "epoch": 1.3529411764705883, "percentage": 13.53, "elapsed_time": "1:51:53", "remaining_time": "11:55:09"} +{"current_steps": 254, "total_steps": 1870, "loss": 0.4158, "lr": 4.7775566231152216e-06, "epoch": 1.358288770053476, "percentage": 13.58, "elapsed_time": "1:51:55", "remaining_time": "11:52:03"} +{"current_steps": 255, "total_steps": 1870, "loss": 0.8809, "lr": 4.775821517374308e-06, "epoch": 1.3636363636363638, "percentage": 13.64, "elapsed_time": "1:51:56", "remaining_time": "11:48:58"} +{"current_steps": 256, "total_steps": 1870, "loss": 0.5157, "lr": 4.7740799883862966e-06, "epoch": 1.3689839572192513, "percentage": 13.69, "elapsed_time": "1:51:58", "remaining_time": "11:45:55"} +{"current_steps": 257, "total_steps": 1870, "loss": 0.4467, "lr": 4.772332041066452e-06, "epoch": 1.374331550802139, "percentage": 13.74, "elapsed_time": "1:52:00", "remaining_time": "11:42:59"} +{"current_steps": 258, "total_steps": 1870, "loss": 0.9125, "lr": 4.770577680348159e-06, "epoch": 1.3796791443850267, "percentage": 13.8, "elapsed_time": "1:52:02", "remaining_time": "11:40:01"} +{"current_steps": 259, "total_steps": 1870, "loss": 0.4665, "lr": 4.768816911182899e-06, "epoch": 1.3850267379679144, "percentage": 13.85, "elapsed_time": "1:52:03", "remaining_time": "11:37:03"} +{"current_steps": 260, "total_steps": 1870, "loss": 0.5404, "lr": 4.767049738540244e-06, "epoch": 1.3903743315508021, "percentage": 13.9, "elapsed_time": "1:52:05", "remaining_time": "11:34:09"} +{"current_steps": 261, "total_steps": 1870, "loss": 0.4575, "lr": 4.765276167407836e-06, "epoch": 1.3957219251336899, "percentage": 13.96, "elapsed_time": "1:52:11", "remaining_time": "11:31:37"} +{"current_steps": 262, "total_steps": 1870, "loss": 0.8227, "lr": 4.7634962027913784e-06, "epoch": 1.4010695187165776, "percentage": 14.01, "elapsed_time": "1:52:18", "remaining_time": "11:29:16"} +{"current_steps": 263, "total_steps": 1870, "loss": 0.5813, "lr": 4.761709849714619e-06, "epoch": 1.4064171122994653, "percentage": 14.06, "elapsed_time": "1:52:20", "remaining_time": "11:26:23"} +{"current_steps": 264, "total_steps": 1870, "loss": 0.6333, "lr": 4.7599171132193355e-06, "epoch": 1.4117647058823528, "percentage": 14.12, "elapsed_time": "1:52:22", "remaining_time": "11:23:35"} +{"current_steps": 265, "total_steps": 1870, "loss": 0.6368, "lr": 4.7581179983653224e-06, "epoch": 1.4171122994652405, "percentage": 14.17, "elapsed_time": "1:52:26", "remaining_time": "11:21:01"} +{"current_steps": 266, "total_steps": 1870, "loss": 0.4146, "lr": 4.756312510230377e-06, "epoch": 1.4224598930481283, "percentage": 14.22, "elapsed_time": "1:52:27", "remaining_time": "11:18:09"} +{"current_steps": 267, "total_steps": 1870, "loss": 0.6066, "lr": 4.754500653910284e-06, "epoch": 1.427807486631016, "percentage": 14.28, "elapsed_time": "1:52:29", "remaining_time": "11:15:24"} +{"current_steps": 268, "total_steps": 1870, "loss": 0.6254, "lr": 4.752682434518801e-06, "epoch": 1.4331550802139037, "percentage": 14.33, "elapsed_time": "1:52:31", "remaining_time": "11:12:37"} +{"current_steps": 269, "total_steps": 1870, "loss": 0.4853, "lr": 4.750857857187645e-06, "epoch": 1.4385026737967914, "percentage": 14.39, "elapsed_time": "1:52:33", "remaining_time": "11:09:54"} +{"current_steps": 270, "total_steps": 1870, "loss": 0.7066, "lr": 4.749026927066479e-06, "epoch": 1.4438502673796791, "percentage": 14.44, "elapsed_time": "1:52:37", "remaining_time": "11:07:25"} +{"current_steps": 271, "total_steps": 1870, "loss": 0.5224, "lr": 4.747189649322894e-06, "epoch": 1.4491978609625669, "percentage": 14.49, "elapsed_time": "1:52:41", "remaining_time": "11:04:55"} +{"current_steps": 272, "total_steps": 1870, "loss": 0.7391, "lr": 4.745346029142397e-06, "epoch": 1.4545454545454546, "percentage": 14.55, "elapsed_time": "1:52:44", "remaining_time": "11:02:23"} +{"current_steps": 273, "total_steps": 1870, "loss": 0.6529, "lr": 4.743496071728396e-06, "epoch": 1.4598930481283423, "percentage": 14.6, "elapsed_time": "1:52:48", "remaining_time": "10:59:57"} +{"current_steps": 274, "total_steps": 1870, "loss": 0.453, "lr": 4.741639782302187e-06, "epoch": 1.46524064171123, "percentage": 14.65, "elapsed_time": "1:52:52", "remaining_time": "10:57:25"} +{"current_steps": 275, "total_steps": 1870, "loss": 0.5275, "lr": 4.739777166102933e-06, "epoch": 1.4705882352941178, "percentage": 14.71, "elapsed_time": "1:52:53", "remaining_time": "10:54:45"} +{"current_steps": 276, "total_steps": 1870, "loss": 0.5838, "lr": 4.737908228387656e-06, "epoch": 1.4759358288770055, "percentage": 14.76, "elapsed_time": "1:52:57", "remaining_time": "10:52:21"} +{"current_steps": 277, "total_steps": 1870, "loss": 0.5719, "lr": 4.736032974431222e-06, "epoch": 1.481283422459893, "percentage": 14.81, "elapsed_time": "1:53:04", "remaining_time": "10:50:18"} +{"current_steps": 278, "total_steps": 1870, "loss": 0.4318, "lr": 4.7341514095263214e-06, "epoch": 1.4866310160427807, "percentage": 14.87, "elapsed_time": "1:53:07", "remaining_time": "10:47:47"} +{"current_steps": 279, "total_steps": 1870, "loss": 0.6388, "lr": 4.732263538983456e-06, "epoch": 1.4919786096256684, "percentage": 14.92, "elapsed_time": "1:53:08", "remaining_time": "10:45:10"} +{"current_steps": 280, "total_steps": 1870, "loss": 0.6673, "lr": 4.730369368130925e-06, "epoch": 1.4973262032085561, "percentage": 14.97, "elapsed_time": "1:53:09", "remaining_time": "10:42:34"} +{"current_steps": 281, "total_steps": 1870, "loss": 1.2311, "lr": 4.728468902314811e-06, "epoch": 1.5026737967914439, "percentage": 15.03, "elapsed_time": "1:53:13", "remaining_time": "10:40:13"} +{"current_steps": 282, "total_steps": 1870, "loss": 0.6467, "lr": 4.726562146898963e-06, "epoch": 1.5080213903743316, "percentage": 15.08, "elapsed_time": "1:53:15", "remaining_time": "10:37:47"} +{"current_steps": 283, "total_steps": 1870, "loss": 0.6265, "lr": 4.72464910726498e-06, "epoch": 1.5133689839572193, "percentage": 15.13, "elapsed_time": "1:53:22", "remaining_time": "10:35:45"} +{"current_steps": 284, "total_steps": 1870, "loss": 0.8415, "lr": 4.7227297888121985e-06, "epoch": 1.5187165775401068, "percentage": 15.19, "elapsed_time": "1:53:25", "remaining_time": "10:33:25"} +{"current_steps": 285, "total_steps": 1870, "loss": 0.6441, "lr": 4.720804196957676e-06, "epoch": 1.5240641711229945, "percentage": 15.24, "elapsed_time": "1:53:27", "remaining_time": "10:30:57"} +{"current_steps": 286, "total_steps": 1870, "loss": 0.8297, "lr": 4.718872337136176e-06, "epoch": 1.5294117647058822, "percentage": 15.29, "elapsed_time": "1:53:28", "remaining_time": "10:28:29"} +{"current_steps": 287, "total_steps": 1870, "loss": 0.9988, "lr": 4.716934214800155e-06, "epoch": 1.53475935828877, "percentage": 15.35, "elapsed_time": "1:53:31", "remaining_time": "10:26:09"} +{"current_steps": 288, "total_steps": 1870, "loss": 0.5931, "lr": 4.714989835419741e-06, "epoch": 1.5401069518716577, "percentage": 15.4, "elapsed_time": "1:53:35", "remaining_time": "10:23:59"} +{"current_steps": 289, "total_steps": 1870, "loss": 0.5902, "lr": 4.713039204482723e-06, "epoch": 1.5454545454545454, "percentage": 15.45, "elapsed_time": "1:53:37", "remaining_time": "10:21:34"} +{"current_steps": 290, "total_steps": 1870, "loss": 0.7356, "lr": 4.711082327494536e-06, "epoch": 1.5508021390374331, "percentage": 15.51, "elapsed_time": "1:53:38", "remaining_time": "10:19:08"} +{"current_steps": 291, "total_steps": 1870, "loss": 0.529, "lr": 4.709119209978242e-06, "epoch": 1.5561497326203209, "percentage": 15.56, "elapsed_time": "1:53:40", "remaining_time": "10:16:46"} +{"current_steps": 292, "total_steps": 1870, "loss": 0.4536, "lr": 4.707149857474516e-06, "epoch": 1.5614973262032086, "percentage": 15.61, "elapsed_time": "1:53:41", "remaining_time": "10:14:25"} +{"current_steps": 293, "total_steps": 1870, "loss": 0.5565, "lr": 4.705174275541632e-06, "epoch": 1.5668449197860963, "percentage": 15.67, "elapsed_time": "1:53:45", "remaining_time": "10:12:15"} +{"current_steps": 294, "total_steps": 1870, "loss": 0.728, "lr": 4.703192469755444e-06, "epoch": 1.572192513368984, "percentage": 15.72, "elapsed_time": "1:53:50", "remaining_time": "10:10:12"} +{"current_steps": 295, "total_steps": 1870, "loss": 0.6269, "lr": 4.701204445709375e-06, "epoch": 1.5775401069518717, "percentage": 15.78, "elapsed_time": "1:53:56", "remaining_time": "10:08:18"} +{"current_steps": 296, "total_steps": 1870, "loss": 0.658, "lr": 4.699210209014394e-06, "epoch": 1.5828877005347595, "percentage": 15.83, "elapsed_time": "1:53:58", "remaining_time": "10:06:05"} +{"current_steps": 297, "total_steps": 1870, "loss": 0.5184, "lr": 4.69720976529901e-06, "epoch": 1.5882352941176472, "percentage": 15.88, "elapsed_time": "1:54:02", "remaining_time": "10:03:57"} +{"current_steps": 298, "total_steps": 1870, "loss": 0.5321, "lr": 4.695203120209245e-06, "epoch": 1.593582887700535, "percentage": 15.94, "elapsed_time": "1:54:03", "remaining_time": "10:01:40"} +{"current_steps": 299, "total_steps": 1870, "loss": 0.4647, "lr": 4.693190279408628e-06, "epoch": 1.5989304812834224, "percentage": 15.99, "elapsed_time": "1:54:07", "remaining_time": "9:59:39"} +{"current_steps": 300, "total_steps": 1870, "loss": 0.4889, "lr": 4.691171248578172e-06, "epoch": 1.6042780748663101, "percentage": 16.04, "elapsed_time": "1:54:10", "remaining_time": "9:57:31"} +{"current_steps": 301, "total_steps": 1870, "loss": 0.6621, "lr": 4.689146033416362e-06, "epoch": 1.6096256684491979, "percentage": 16.1, "elapsed_time": "1:54:13", "remaining_time": "9:55:24"} +{"current_steps": 302, "total_steps": 1870, "loss": 0.4009, "lr": 4.687114639639136e-06, "epoch": 1.6149732620320856, "percentage": 16.15, "elapsed_time": "1:54:16", "remaining_time": "9:53:21"} +{"current_steps": 303, "total_steps": 1870, "loss": 0.5065, "lr": 4.685077072979874e-06, "epoch": 1.6203208556149733, "percentage": 16.2, "elapsed_time": "1:54:18", "remaining_time": "9:51:09"} +{"current_steps": 304, "total_steps": 1870, "loss": 0.5289, "lr": 4.683033339189375e-06, "epoch": 1.6256684491978608, "percentage": 16.26, "elapsed_time": "1:54:20", "remaining_time": "9:49:01"} +{"current_steps": 305, "total_steps": 1870, "loss": 0.7078, "lr": 4.680983444035843e-06, "epoch": 1.6310160427807485, "percentage": 16.31, "elapsed_time": "1:54:26", "remaining_time": "9:47:11"} +{"current_steps": 306, "total_steps": 1870, "loss": 0.4003, "lr": 4.678927393304877e-06, "epoch": 1.6363636363636362, "percentage": 16.36, "elapsed_time": "1:54:27", "remaining_time": "9:44:58"} +{"current_steps": 307, "total_steps": 1870, "loss": 0.4802, "lr": 4.676865192799443e-06, "epoch": 1.641711229946524, "percentage": 16.42, "elapsed_time": "1:54:29", "remaining_time": "9:42:53"} +{"current_steps": 308, "total_steps": 1870, "loss": 0.8128, "lr": 4.6747968483398695e-06, "epoch": 1.6470588235294117, "percentage": 16.47, "elapsed_time": "1:54:30", "remaining_time": "9:40:44"} +{"current_steps": 309, "total_steps": 1870, "loss": 0.4085, "lr": 4.672722365763821e-06, "epoch": 1.6524064171122994, "percentage": 16.52, "elapsed_time": "1:54:32", "remaining_time": "9:38:40"} +{"current_steps": 310, "total_steps": 1870, "loss": 0.5707, "lr": 4.6706417509262905e-06, "epoch": 1.6577540106951871, "percentage": 16.58, "elapsed_time": "1:54:34", "remaining_time": "9:36:34"} +{"current_steps": 311, "total_steps": 1870, "loss": 0.481, "lr": 4.668555009699575e-06, "epoch": 1.6631016042780749, "percentage": 16.63, "elapsed_time": "1:54:37", "remaining_time": "9:34:36"} +{"current_steps": 312, "total_steps": 1870, "loss": 0.6021, "lr": 4.666462147973264e-06, "epoch": 1.6684491978609626, "percentage": 16.68, "elapsed_time": "1:54:39", "remaining_time": "9:32:35"} +{"current_steps": 313, "total_steps": 1870, "loss": 0.7208, "lr": 4.664363171654223e-06, "epoch": 1.6737967914438503, "percentage": 16.74, "elapsed_time": "1:54:42", "remaining_time": "9:30:38"} +{"current_steps": 314, "total_steps": 1870, "loss": 0.9136, "lr": 4.662258086666571e-06, "epoch": 1.679144385026738, "percentage": 16.79, "elapsed_time": "1:54:45", "remaining_time": "9:28:40"} +{"current_steps": 315, "total_steps": 1870, "loss": 0.7375, "lr": 4.660146898951674e-06, "epoch": 1.6844919786096257, "percentage": 16.84, "elapsed_time": "1:54:49", "remaining_time": "9:26:52"} +{"current_steps": 316, "total_steps": 1870, "loss": 0.6786, "lr": 4.6580296144681155e-06, "epoch": 1.6898395721925135, "percentage": 16.9, "elapsed_time": "1:54:55", "remaining_time": "9:25:10"} +{"current_steps": 317, "total_steps": 1870, "loss": 0.789, "lr": 4.655906239191693e-06, "epoch": 1.6951871657754012, "percentage": 16.95, "elapsed_time": "1:54:57", "remaining_time": "9:23:10"} +{"current_steps": 318, "total_steps": 1870, "loss": 0.7104, "lr": 4.653776779115389e-06, "epoch": 1.700534759358289, "percentage": 17.01, "elapsed_time": "1:55:00", "remaining_time": "9:21:16"} +{"current_steps": 319, "total_steps": 1870, "loss": 0.5165, "lr": 4.651641240249364e-06, "epoch": 1.7058823529411766, "percentage": 17.06, "elapsed_time": "1:55:06", "remaining_time": "9:19:39"} +{"current_steps": 320, "total_steps": 1870, "loss": 0.4081, "lr": 4.649499628620931e-06, "epoch": 1.7112299465240641, "percentage": 17.11, "elapsed_time": "1:55:08", "remaining_time": "9:17:43"} +{"current_steps": 321, "total_steps": 1870, "loss": 0.6536, "lr": 4.647351950274548e-06, "epoch": 1.7165775401069518, "percentage": 17.17, "elapsed_time": "1:55:11", "remaining_time": "9:15:52"} +{"current_steps": 322, "total_steps": 1870, "loss": 0.6597, "lr": 4.6451982112717896e-06, "epoch": 1.7219251336898396, "percentage": 17.22, "elapsed_time": "1:55:13", "remaining_time": "9:13:57"} +{"current_steps": 323, "total_steps": 1870, "loss": 0.7608, "lr": 4.643038417691341e-06, "epoch": 1.7272727272727273, "percentage": 17.27, "elapsed_time": "1:55:18", "remaining_time": "9:12:15"} +{"current_steps": 324, "total_steps": 1870, "loss": 0.4597, "lr": 4.640872575628973e-06, "epoch": 1.732620320855615, "percentage": 17.33, "elapsed_time": "1:55:22", "remaining_time": "9:10:33"} +{"current_steps": 325, "total_steps": 1870, "loss": 0.7241, "lr": 4.6387006911975275e-06, "epoch": 1.7379679144385025, "percentage": 17.38, "elapsed_time": "1:55:27", "remaining_time": "9:08:51"} +{"current_steps": 326, "total_steps": 1870, "loss": 0.7654, "lr": 4.6365227705269026e-06, "epoch": 1.7433155080213902, "percentage": 17.43, "elapsed_time": "1:55:29", "remaining_time": "9:06:57"} +{"current_steps": 327, "total_steps": 1870, "loss": 0.6391, "lr": 4.634338819764029e-06, "epoch": 1.748663101604278, "percentage": 17.49, "elapsed_time": "1:55:34", "remaining_time": "9:05:23"} +{"current_steps": 328, "total_steps": 1870, "loss": 0.5501, "lr": 4.632148845072861e-06, "epoch": 1.7540106951871657, "percentage": 17.54, "elapsed_time": "1:55:35", "remaining_time": "9:03:26"} +{"current_steps": 329, "total_steps": 1870, "loss": 0.6117, "lr": 4.6299528526343525e-06, "epoch": 1.7593582887700534, "percentage": 17.59, "elapsed_time": "1:55:39", "remaining_time": "9:01:42"} +{"current_steps": 330, "total_steps": 1870, "loss": 0.8534, "lr": 4.627750848646443e-06, "epoch": 1.7647058823529411, "percentage": 17.65, "elapsed_time": "1:55:41", "remaining_time": "8:59:53"} +{"current_steps": 331, "total_steps": 1870, "loss": 0.6352, "lr": 4.625542839324036e-06, "epoch": 1.7700534759358288, "percentage": 17.7, "elapsed_time": "1:55:47", "remaining_time": "8:58:21"} +{"current_steps": 332, "total_steps": 1870, "loss": 0.4188, "lr": 4.6233288308989874e-06, "epoch": 1.7754010695187166, "percentage": 17.75, "elapsed_time": "1:55:53", "remaining_time": "8:56:50"} +{"current_steps": 333, "total_steps": 1870, "loss": 0.4464, "lr": 4.6211088296200834e-06, "epoch": 1.7807486631016043, "percentage": 17.81, "elapsed_time": "1:55:54", "remaining_time": "8:55:01"} +{"current_steps": 334, "total_steps": 1870, "loss": 0.6833, "lr": 4.618882841753026e-06, "epoch": 1.786096256684492, "percentage": 17.86, "elapsed_time": "1:55:57", "remaining_time": "8:53:14"} +{"current_steps": 335, "total_steps": 1870, "loss": 0.6356, "lr": 4.616650873580411e-06, "epoch": 1.7914438502673797, "percentage": 17.91, "elapsed_time": "1:56:00", "remaining_time": "8:51:31"} +{"current_steps": 336, "total_steps": 1870, "loss": 0.5413, "lr": 4.614412931401715e-06, "epoch": 1.7967914438502675, "percentage": 17.97, "elapsed_time": "1:56:03", "remaining_time": "8:49:51"} +{"current_steps": 337, "total_steps": 1870, "loss": 0.5275, "lr": 4.612169021533276e-06, "epoch": 1.8021390374331552, "percentage": 18.02, "elapsed_time": "1:56:06", "remaining_time": "8:48:12"} +{"current_steps": 338, "total_steps": 1870, "loss": 0.6292, "lr": 4.609919150308273e-06, "epoch": 1.807486631016043, "percentage": 18.07, "elapsed_time": "1:56:09", "remaining_time": "8:46:28"} +{"current_steps": 339, "total_steps": 1870, "loss": 0.5315, "lr": 4.607663324076711e-06, "epoch": 1.8128342245989306, "percentage": 18.13, "elapsed_time": "1:56:10", "remaining_time": "8:44:38"} +{"current_steps": 340, "total_steps": 1870, "loss": 0.7492, "lr": 4.605401549205404e-06, "epoch": 1.8181818181818183, "percentage": 18.18, "elapsed_time": "1:56:12", "remaining_time": "8:42:55"} +{"current_steps": 341, "total_steps": 1870, "loss": 0.6453, "lr": 4.603133832077953e-06, "epoch": 1.8235294117647058, "percentage": 18.24, "elapsed_time": "1:56:13", "remaining_time": "8:41:08"} +{"current_steps": 342, "total_steps": 1870, "loss": 0.6502, "lr": 4.600860179094732e-06, "epoch": 1.8288770053475936, "percentage": 18.29, "elapsed_time": "1:56:17", "remaining_time": "8:39:36"} +{"current_steps": 343, "total_steps": 1870, "loss": 0.6807, "lr": 4.5985805966728675e-06, "epoch": 1.8342245989304813, "percentage": 18.34, "elapsed_time": "1:56:19", "remaining_time": "8:37:49"} +{"current_steps": 344, "total_steps": 1870, "loss": 0.5235, "lr": 4.596295091246221e-06, "epoch": 1.839572192513369, "percentage": 18.4, "elapsed_time": "1:56:20", "remaining_time": "8:36:05"} +{"current_steps": 345, "total_steps": 1870, "loss": 0.5847, "lr": 4.594003669265371e-06, "epoch": 1.8449197860962567, "percentage": 18.45, "elapsed_time": "1:56:22", "remaining_time": "8:34:26"} +{"current_steps": 346, "total_steps": 1870, "loss": 0.6266, "lr": 4.591706337197597e-06, "epoch": 1.8502673796791442, "percentage": 18.5, "elapsed_time": "1:56:29", "remaining_time": "8:33:07"} +{"current_steps": 347, "total_steps": 1870, "loss": 0.5021, "lr": 4.589403101526854e-06, "epoch": 1.855614973262032, "percentage": 18.56, "elapsed_time": "1:56:32", "remaining_time": "8:31:30"} +{"current_steps": 348, "total_steps": 1870, "loss": 0.6426, "lr": 4.587093968753765e-06, "epoch": 1.8609625668449197, "percentage": 18.61, "elapsed_time": "1:56:34", "remaining_time": "8:29:53"} +{"current_steps": 349, "total_steps": 1870, "loss": 0.41, "lr": 4.584778945395594e-06, "epoch": 1.8663101604278074, "percentage": 18.66, "elapsed_time": "1:56:39", "remaining_time": "8:28:25"} +{"current_steps": 350, "total_steps": 1870, "loss": 0.6775, "lr": 4.582458037986231e-06, "epoch": 1.8716577540106951, "percentage": 18.72, "elapsed_time": "1:56:40", "remaining_time": "8:26:44"} +{"current_steps": 351, "total_steps": 1870, "loss": 0.9407, "lr": 4.580131253076171e-06, "epoch": 1.8770053475935828, "percentage": 18.77, "elapsed_time": "1:56:44", "remaining_time": "8:25:10"} +{"current_steps": 352, "total_steps": 1870, "loss": 0.6412, "lr": 4.5777985972325016e-06, "epoch": 1.8823529411764706, "percentage": 18.82, "elapsed_time": "1:56:47", "remaining_time": "8:23:38"} +{"current_steps": 353, "total_steps": 1870, "loss": 0.4353, "lr": 4.575460077038877e-06, "epoch": 1.8877005347593583, "percentage": 18.88, "elapsed_time": "1:56:49", "remaining_time": "8:22:02"} +{"current_steps": 354, "total_steps": 1870, "loss": 0.934, "lr": 4.573115699095505e-06, "epoch": 1.893048128342246, "percentage": 18.93, "elapsed_time": "1:56:52", "remaining_time": "8:20:32"} +{"current_steps": 355, "total_steps": 1870, "loss": 0.472, "lr": 4.570765470019125e-06, "epoch": 1.8983957219251337, "percentage": 18.98, "elapsed_time": "1:56:54", "remaining_time": "8:18:55"} +{"current_steps": 356, "total_steps": 1870, "loss": 0.6079, "lr": 4.5684093964429906e-06, "epoch": 1.9037433155080214, "percentage": 19.04, "elapsed_time": "1:56:58", "remaining_time": "8:17:26"} +{"current_steps": 357, "total_steps": 1870, "loss": 0.4644, "lr": 4.566047485016853e-06, "epoch": 1.9090909090909092, "percentage": 19.09, "elapsed_time": "1:57:01", "remaining_time": "8:15:59"} +{"current_steps": 358, "total_steps": 1870, "loss": 0.721, "lr": 4.563679742406935e-06, "epoch": 1.914438502673797, "percentage": 19.14, "elapsed_time": "1:57:03", "remaining_time": "8:14:22"} +{"current_steps": 359, "total_steps": 1870, "loss": 1.0296, "lr": 4.5613061752959236e-06, "epoch": 1.9197860962566846, "percentage": 19.2, "elapsed_time": "1:57:06", "remaining_time": "8:12:53"} +{"current_steps": 360, "total_steps": 1870, "loss": 0.892, "lr": 4.558926790382941e-06, "epoch": 1.9251336898395723, "percentage": 19.25, "elapsed_time": "1:57:09", "remaining_time": "8:11:24"} +{"current_steps": 361, "total_steps": 1870, "loss": 0.6153, "lr": 4.556541594383528e-06, "epoch": 1.93048128342246, "percentage": 19.3, "elapsed_time": "1:57:12", "remaining_time": "8:09:56"} +{"current_steps": 362, "total_steps": 1870, "loss": 0.3246, "lr": 4.554150594029631e-06, "epoch": 1.9358288770053476, "percentage": 19.36, "elapsed_time": "1:57:14", "remaining_time": "8:08:23"} +{"current_steps": 363, "total_steps": 1870, "loss": 0.5986, "lr": 4.551753796069577e-06, "epoch": 1.9411764705882353, "percentage": 19.41, "elapsed_time": "1:57:17", "remaining_time": "8:06:57"} +{"current_steps": 364, "total_steps": 1870, "loss": 0.5642, "lr": 4.5493512072680535e-06, "epoch": 1.946524064171123, "percentage": 19.47, "elapsed_time": "1:57:22", "remaining_time": "8:05:37"} +{"current_steps": 365, "total_steps": 1870, "loss": 0.7661, "lr": 4.546942834406094e-06, "epoch": 1.9518716577540107, "percentage": 19.52, "elapsed_time": "1:57:25", "remaining_time": "8:04:10"} +{"current_steps": 366, "total_steps": 1870, "loss": 0.4739, "lr": 4.544528684281056e-06, "epoch": 1.9572192513368984, "percentage": 19.57, "elapsed_time": "1:57:26", "remaining_time": "8:02:36"} +{"current_steps": 367, "total_steps": 1870, "loss": 0.4551, "lr": 4.5421087637066065e-06, "epoch": 1.962566844919786, "percentage": 19.63, "elapsed_time": "1:57:30", "remaining_time": "8:01:13"} +{"current_steps": 368, "total_steps": 1870, "loss": 0.7336, "lr": 4.539683079512692e-06, "epoch": 1.9679144385026737, "percentage": 19.68, "elapsed_time": "1:57:33", "remaining_time": "7:59:50"} +{"current_steps": 369, "total_steps": 1870, "loss": 0.5833, "lr": 4.537251638545532e-06, "epoch": 1.9732620320855614, "percentage": 19.73, "elapsed_time": "1:57:36", "remaining_time": "7:58:24"} +{"current_steps": 370, "total_steps": 1870, "loss": 0.3305, "lr": 4.534814447667591e-06, "epoch": 1.9786096256684491, "percentage": 19.79, "elapsed_time": "1:57:38", "remaining_time": "7:56:55"} +{"current_steps": 371, "total_steps": 1870, "loss": 0.4912, "lr": 4.532371513757564e-06, "epoch": 1.9839572192513368, "percentage": 19.84, "elapsed_time": "1:57:42", "remaining_time": "7:55:35"} +{"current_steps": 372, "total_steps": 1870, "loss": 0.611, "lr": 4.529922843710354e-06, "epoch": 1.9893048128342246, "percentage": 19.89, "elapsed_time": "1:57:46", "remaining_time": "7:54:14"} +{"current_steps": 373, "total_steps": 1870, "loss": 0.6487, "lr": 4.52746844443705e-06, "epoch": 1.9946524064171123, "percentage": 19.95, "elapsed_time": "1:57:52", "remaining_time": "7:53:04"} +{"current_steps": 374, "total_steps": 1870, "loss": 0.607, "lr": 4.525008322864917e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:57:55", "remaining_time": "7:51:42"} +{"current_steps": 375, "total_steps": 1870, "loss": 0.4111, "lr": 4.522542485937369e-06, "epoch": 2.0053475935828877, "percentage": 20.05, "elapsed_time": "2:52:28", "remaining_time": "11:27:37"} +{"current_steps": 376, "total_steps": 1870, "loss": 0.3418, "lr": 4.520070940613948e-06, "epoch": 2.0106951871657754, "percentage": 20.11, "elapsed_time": "2:52:32", "remaining_time": "11:25:36"} +{"current_steps": 377, "total_steps": 1870, "loss": 0.3853, "lr": 4.51759369387031e-06, "epoch": 2.016042780748663, "percentage": 20.16, "elapsed_time": "2:52:36", "remaining_time": "11:23:33"} +{"current_steps": 378, "total_steps": 1870, "loss": 0.4586, "lr": 4.515110752698203e-06, "epoch": 2.021390374331551, "percentage": 20.21, "elapsed_time": "2:52:39", "remaining_time": "11:21:30"} +{"current_steps": 379, "total_steps": 1870, "loss": 0.5457, "lr": 4.512622124105444e-06, "epoch": 2.0267379679144386, "percentage": 20.27, "elapsed_time": "2:52:42", "remaining_time": "11:19:28"} +{"current_steps": 380, "total_steps": 1870, "loss": 0.5248, "lr": 4.510127815115904e-06, "epoch": 2.0320855614973263, "percentage": 20.32, "elapsed_time": "2:52:46", "remaining_time": "11:17:28"} +{"current_steps": 381, "total_steps": 1870, "loss": 0.3257, "lr": 4.507627832769486e-06, "epoch": 2.037433155080214, "percentage": 20.37, "elapsed_time": "2:52:49", "remaining_time": "11:15:23"} +{"current_steps": 382, "total_steps": 1870, "loss": 0.6607, "lr": 4.505122184122107e-06, "epoch": 2.0427807486631018, "percentage": 20.43, "elapsed_time": "2:52:51", "remaining_time": "11:13:18"} +{"current_steps": 383, "total_steps": 1870, "loss": 0.285, "lr": 4.502610876245674e-06, "epoch": 2.0481283422459895, "percentage": 20.48, "elapsed_time": "2:52:55", "remaining_time": "11:11:21"} +{"current_steps": 384, "total_steps": 1870, "loss": 0.2656, "lr": 4.500093916228068e-06, "epoch": 2.053475935828877, "percentage": 20.53, "elapsed_time": "2:52:57", "remaining_time": "11:09:18"} +{"current_steps": 385, "total_steps": 1870, "loss": 0.7756, "lr": 4.4975713111731206e-06, "epoch": 2.0588235294117645, "percentage": 20.59, "elapsed_time": "2:52:59", "remaining_time": "11:07:13"} +{"current_steps": 386, "total_steps": 1870, "loss": 0.2841, "lr": 4.4950430682005995e-06, "epoch": 2.064171122994652, "percentage": 20.64, "elapsed_time": "2:53:02", "remaining_time": "11:05:17"} +{"current_steps": 387, "total_steps": 1870, "loss": 0.375, "lr": 4.49250919444618e-06, "epoch": 2.06951871657754, "percentage": 20.7, "elapsed_time": "2:53:06", "remaining_time": "11:03:21"} +{"current_steps": 388, "total_steps": 1870, "loss": 0.4506, "lr": 4.489969697061436e-06, "epoch": 2.0748663101604277, "percentage": 20.75, "elapsed_time": "2:53:08", "remaining_time": "11:01:21"} +{"current_steps": 389, "total_steps": 1870, "loss": 0.3308, "lr": 4.487424583213807e-06, "epoch": 2.0802139037433154, "percentage": 20.8, "elapsed_time": "2:53:11", "remaining_time": "10:59:21"} +{"current_steps": 390, "total_steps": 1870, "loss": 0.3271, "lr": 4.484873860086586e-06, "epoch": 2.085561497326203, "percentage": 20.86, "elapsed_time": "2:53:14", "remaining_time": "10:57:26"} +{"current_steps": 391, "total_steps": 1870, "loss": 0.22, "lr": 4.482317534878901e-06, "epoch": 2.090909090909091, "percentage": 20.91, "elapsed_time": "2:53:16", "remaining_time": "10:55:24"} +{"current_steps": 392, "total_steps": 1870, "loss": 0.4323, "lr": 4.4797556148056884e-06, "epoch": 2.0962566844919786, "percentage": 20.96, "elapsed_time": "2:53:20", "remaining_time": "10:53:33"} +{"current_steps": 393, "total_steps": 1870, "loss": 0.5702, "lr": 4.477188107097675e-06, "epoch": 2.1016042780748663, "percentage": 21.02, "elapsed_time": "2:53:22", "remaining_time": "10:51:35"} +{"current_steps": 394, "total_steps": 1870, "loss": 0.2821, "lr": 4.474615019001359e-06, "epoch": 2.106951871657754, "percentage": 21.07, "elapsed_time": "2:53:27", "remaining_time": "10:49:47"} +{"current_steps": 395, "total_steps": 1870, "loss": 0.6381, "lr": 4.47203635777899e-06, "epoch": 2.1122994652406417, "percentage": 21.12, "elapsed_time": "2:53:30", "remaining_time": "10:47:54"} +{"current_steps": 396, "total_steps": 1870, "loss": 0.5999, "lr": 4.469452130708544e-06, "epoch": 2.1176470588235294, "percentage": 21.18, "elapsed_time": "2:53:32", "remaining_time": "10:45:57"} +{"current_steps": 397, "total_steps": 1870, "loss": 0.2416, "lr": 4.4668623450837085e-06, "epoch": 2.122994652406417, "percentage": 21.23, "elapsed_time": "2:53:33", "remaining_time": "10:43:59"} +{"current_steps": 398, "total_steps": 1870, "loss": 0.6149, "lr": 4.464267008213858e-06, "epoch": 2.128342245989305, "percentage": 21.28, "elapsed_time": "2:53:36", "remaining_time": "10:42:05"} +{"current_steps": 399, "total_steps": 1870, "loss": 0.4445, "lr": 4.461666127424036e-06, "epoch": 2.1336898395721926, "percentage": 21.34, "elapsed_time": "2:53:38", "remaining_time": "10:40:09"} +{"current_steps": 400, "total_steps": 1870, "loss": 0.4432, "lr": 4.459059710054933e-06, "epoch": 2.1390374331550803, "percentage": 21.39, "elapsed_time": "2:53:42", "remaining_time": "10:38:22"} +{"current_steps": 401, "total_steps": 1870, "loss": 0.6437, "lr": 4.456447763462863e-06, "epoch": 2.144385026737968, "percentage": 21.44, "elapsed_time": "2:53:44", "remaining_time": "10:36:27"} +{"current_steps": 402, "total_steps": 1870, "loss": 0.3432, "lr": 4.453830295019749e-06, "epoch": 2.1497326203208558, "percentage": 21.5, "elapsed_time": "2:53:46", "remaining_time": "10:34:34"} +{"current_steps": 403, "total_steps": 1870, "loss": 0.5478, "lr": 4.4512073121130985e-06, "epoch": 2.1550802139037435, "percentage": 21.55, "elapsed_time": "2:53:53", "remaining_time": "10:33:00"} +{"current_steps": 404, "total_steps": 1870, "loss": 0.2706, "lr": 4.448578822145982e-06, "epoch": 2.160427807486631, "percentage": 21.6, "elapsed_time": "2:53:54", "remaining_time": "10:31:04"} +{"current_steps": 405, "total_steps": 1870, "loss": 0.2337, "lr": 4.445944832537011e-06, "epoch": 2.165775401069519, "percentage": 21.66, "elapsed_time": "2:53:56", "remaining_time": "10:29:12"} +{"current_steps": 406, "total_steps": 1870, "loss": 0.551, "lr": 4.443305350720324e-06, "epoch": 2.171122994652406, "percentage": 21.71, "elapsed_time": "2:53:58", "remaining_time": "10:27:19"} +{"current_steps": 407, "total_steps": 1870, "loss": 0.4896, "lr": 4.440660384145557e-06, "epoch": 2.176470588235294, "percentage": 21.76, "elapsed_time": "2:54:01", "remaining_time": "10:25:31"} +{"current_steps": 408, "total_steps": 1870, "loss": 0.4094, "lr": 4.438009940277825e-06, "epoch": 2.1818181818181817, "percentage": 21.82, "elapsed_time": "2:54:06", "remaining_time": "10:23:51"} +{"current_steps": 409, "total_steps": 1870, "loss": 0.2985, "lr": 4.435354026597707e-06, "epoch": 2.1871657754010694, "percentage": 21.87, "elapsed_time": "2:54:07", "remaining_time": "10:21:59"} +{"current_steps": 410, "total_steps": 1870, "loss": 0.6087, "lr": 4.432692650601215e-06, "epoch": 2.192513368983957, "percentage": 21.93, "elapsed_time": "2:54:09", "remaining_time": "10:20:10"} +{"current_steps": 411, "total_steps": 1870, "loss": 0.2324, "lr": 4.43002581979978e-06, "epoch": 2.197860962566845, "percentage": 21.98, "elapsed_time": "2:54:11", "remaining_time": "10:18:19"} +{"current_steps": 412, "total_steps": 1870, "loss": 0.2497, "lr": 4.42735354172023e-06, "epoch": 2.2032085561497325, "percentage": 22.03, "elapsed_time": "2:54:12", "remaining_time": "10:16:30"} +{"current_steps": 413, "total_steps": 1870, "loss": 0.5508, "lr": 4.4246758239047636e-06, "epoch": 2.2085561497326203, "percentage": 22.09, "elapsed_time": "2:54:16", "remaining_time": "10:14:49"} +{"current_steps": 414, "total_steps": 1870, "loss": 0.248, "lr": 4.421992673910934e-06, "epoch": 2.213903743315508, "percentage": 22.14, "elapsed_time": "2:54:17", "remaining_time": "10:12:59"} +{"current_steps": 415, "total_steps": 1870, "loss": 0.3047, "lr": 4.4193040993116284e-06, "epoch": 2.2192513368983957, "percentage": 22.19, "elapsed_time": "2:54:20", "remaining_time": "10:11:13"} +{"current_steps": 416, "total_steps": 1870, "loss": 0.2304, "lr": 4.416610107695043e-06, "epoch": 2.2245989304812834, "percentage": 22.25, "elapsed_time": "2:54:23", "remaining_time": "10:09:33"} +{"current_steps": 417, "total_steps": 1870, "loss": 0.4514, "lr": 4.413910706664659e-06, "epoch": 2.229946524064171, "percentage": 22.3, "elapsed_time": "2:54:29", "remaining_time": "10:08:01"} +{"current_steps": 418, "total_steps": 1870, "loss": 0.626, "lr": 4.411205903839232e-06, "epoch": 2.235294117647059, "percentage": 22.35, "elapsed_time": "2:54:31", "remaining_time": "10:06:15"} +{"current_steps": 419, "total_steps": 1870, "loss": 0.4973, "lr": 4.408495706852758e-06, "epoch": 2.2406417112299466, "percentage": 22.41, "elapsed_time": "2:54:34", "remaining_time": "10:04:33"} +{"current_steps": 420, "total_steps": 1870, "loss": 0.6283, "lr": 4.40578012335446e-06, "epoch": 2.2459893048128343, "percentage": 22.46, "elapsed_time": "2:54:36", "remaining_time": "10:02:48"} +{"current_steps": 421, "total_steps": 1870, "loss": 0.4237, "lr": 4.403059161008762e-06, "epoch": 2.251336898395722, "percentage": 22.51, "elapsed_time": "2:54:39", "remaining_time": "10:01:07"} +{"current_steps": 422, "total_steps": 1870, "loss": 0.4727, "lr": 4.4003328274952735e-06, "epoch": 2.2566844919786098, "percentage": 22.57, "elapsed_time": "2:54:40", "remaining_time": "9:59:21"} +{"current_steps": 423, "total_steps": 1870, "loss": 0.488, "lr": 4.397601130508757e-06, "epoch": 2.2620320855614975, "percentage": 22.62, "elapsed_time": "2:54:43", "remaining_time": "9:57:41"} +{"current_steps": 424, "total_steps": 1870, "loss": 0.4013, "lr": 4.394864077759119e-06, "epoch": 2.267379679144385, "percentage": 22.67, "elapsed_time": "2:54:45", "remaining_time": "9:55:58"} +{"current_steps": 425, "total_steps": 1870, "loss": 0.338, "lr": 4.392121676971377e-06, "epoch": 2.2727272727272725, "percentage": 22.73, "elapsed_time": "2:54:48", "remaining_time": "9:54:20"} +{"current_steps": 426, "total_steps": 1870, "loss": 0.647, "lr": 4.3893739358856465e-06, "epoch": 2.2780748663101607, "percentage": 22.78, "elapsed_time": "2:54:50", "remaining_time": "9:52:40"} +{"current_steps": 427, "total_steps": 1870, "loss": 0.5964, "lr": 4.386620862257113e-06, "epoch": 2.283422459893048, "percentage": 22.83, "elapsed_time": "2:54:52", "remaining_time": "9:50:59"} +{"current_steps": 428, "total_steps": 1870, "loss": 0.4262, "lr": 4.383862463856013e-06, "epoch": 2.2887700534759357, "percentage": 22.89, "elapsed_time": "2:54:54", "remaining_time": "9:49:17"} +{"current_steps": 429, "total_steps": 1870, "loss": 0.5824, "lr": 4.3810987484676126e-06, "epoch": 2.2941176470588234, "percentage": 22.94, "elapsed_time": "2:55:00", "remaining_time": "9:47:51"} +{"current_steps": 430, "total_steps": 1870, "loss": 0.4735, "lr": 4.378329723892184e-06, "epoch": 2.299465240641711, "percentage": 22.99, "elapsed_time": "2:55:02", "remaining_time": "9:46:09"} +{"current_steps": 431, "total_steps": 1870, "loss": 0.2989, "lr": 4.375555397944983e-06, "epoch": 2.304812834224599, "percentage": 23.05, "elapsed_time": "2:55:05", "remaining_time": "9:44:35"} +{"current_steps": 432, "total_steps": 1870, "loss": 0.3582, "lr": 4.37277577845623e-06, "epoch": 2.3101604278074865, "percentage": 23.1, "elapsed_time": "2:55:12", "remaining_time": "9:43:14"} +{"current_steps": 433, "total_steps": 1870, "loss": 0.3717, "lr": 4.369990873271082e-06, "epoch": 2.3155080213903743, "percentage": 23.16, "elapsed_time": "2:55:13", "remaining_time": "9:41:32"} +{"current_steps": 434, "total_steps": 1870, "loss": 0.3929, "lr": 4.36720069024962e-06, "epoch": 2.320855614973262, "percentage": 23.21, "elapsed_time": "2:55:15", "remaining_time": "9:39:53"} +{"current_steps": 435, "total_steps": 1870, "loss": 0.2747, "lr": 4.364405237266816e-06, "epoch": 2.3262032085561497, "percentage": 23.26, "elapsed_time": "2:55:18", "remaining_time": "9:38:19"} +{"current_steps": 436, "total_steps": 1870, "loss": 0.3981, "lr": 4.361604522212517e-06, "epoch": 2.3315508021390374, "percentage": 23.32, "elapsed_time": "2:55:21", "remaining_time": "9:36:45"} +{"current_steps": 437, "total_steps": 1870, "loss": 0.5474, "lr": 4.358798552991424e-06, "epoch": 2.336898395721925, "percentage": 23.37, "elapsed_time": "2:55:24", "remaining_time": "9:35:13"} +{"current_steps": 438, "total_steps": 1870, "loss": 0.3569, "lr": 4.355987337523065e-06, "epoch": 2.342245989304813, "percentage": 23.42, "elapsed_time": "2:55:28", "remaining_time": "9:33:42"} +{"current_steps": 439, "total_steps": 1870, "loss": 0.3298, "lr": 4.353170883741776e-06, "epoch": 2.3475935828877006, "percentage": 23.48, "elapsed_time": "2:55:33", "remaining_time": "9:32:15"} +{"current_steps": 440, "total_steps": 1870, "loss": 0.3982, "lr": 4.350349199596676e-06, "epoch": 2.3529411764705883, "percentage": 23.53, "elapsed_time": "2:55:37", "remaining_time": "9:30:47"} +{"current_steps": 441, "total_steps": 1870, "loss": 0.6064, "lr": 4.3475222930516484e-06, "epoch": 2.358288770053476, "percentage": 23.58, "elapsed_time": "2:55:43", "remaining_time": "9:29:25"} +{"current_steps": 442, "total_steps": 1870, "loss": 0.4624, "lr": 4.3446901720853144e-06, "epoch": 2.3636363636363638, "percentage": 23.64, "elapsed_time": "2:55:47", "remaining_time": "9:27:57"} +{"current_steps": 443, "total_steps": 1870, "loss": 0.4464, "lr": 4.341852844691012e-06, "epoch": 2.3689839572192515, "percentage": 23.69, "elapsed_time": "2:55:49", "remaining_time": "9:26:22"} +{"current_steps": 444, "total_steps": 1870, "loss": 0.4206, "lr": 4.339010318876777e-06, "epoch": 2.374331550802139, "percentage": 23.74, "elapsed_time": "2:55:54", "remaining_time": "9:24:57"} +{"current_steps": 445, "total_steps": 1870, "loss": 0.5911, "lr": 4.336162602665314e-06, "epoch": 2.379679144385027, "percentage": 23.8, "elapsed_time": "2:56:01", "remaining_time": "9:23:40"} +{"current_steps": 446, "total_steps": 1870, "loss": 0.4042, "lr": 4.333309704093977e-06, "epoch": 2.385026737967914, "percentage": 23.85, "elapsed_time": "2:56:05", "remaining_time": "9:22:14"} +{"current_steps": 447, "total_steps": 1870, "loss": 0.202, "lr": 4.330451631214747e-06, "epoch": 2.3903743315508024, "percentage": 23.9, "elapsed_time": "2:56:08", "remaining_time": "9:20:42"} +{"current_steps": 448, "total_steps": 1870, "loss": 0.3801, "lr": 4.3275883920942105e-06, "epoch": 2.3957219251336896, "percentage": 23.96, "elapsed_time": "2:56:09", "remaining_time": "9:19:10"} +{"current_steps": 449, "total_steps": 1870, "loss": 0.4677, "lr": 4.324719994813533e-06, "epoch": 2.4010695187165774, "percentage": 24.01, "elapsed_time": "2:56:16", "remaining_time": "9:17:52"} +{"current_steps": 450, "total_steps": 1870, "loss": 0.4263, "lr": 4.321846447468441e-06, "epoch": 2.406417112299465, "percentage": 24.06, "elapsed_time": "2:56:18", "remaining_time": "9:16:21"} +{"current_steps": 451, "total_steps": 1870, "loss": 0.6155, "lr": 4.318967758169192e-06, "epoch": 2.411764705882353, "percentage": 24.12, "elapsed_time": "2:56:22", "remaining_time": "9:14:56"} +{"current_steps": 452, "total_steps": 1870, "loss": 0.6256, "lr": 4.316083935040561e-06, "epoch": 2.4171122994652405, "percentage": 24.17, "elapsed_time": "2:56:25", "remaining_time": "9:13:27"} +{"current_steps": 453, "total_steps": 1870, "loss": 0.3739, "lr": 4.313194986221809e-06, "epoch": 2.4224598930481283, "percentage": 24.22, "elapsed_time": "2:56:29", "remaining_time": "9:12:03"} +{"current_steps": 454, "total_steps": 1870, "loss": 0.3881, "lr": 4.310300919866666e-06, "epoch": 2.427807486631016, "percentage": 24.28, "elapsed_time": "2:56:35", "remaining_time": "9:10:45"} +{"current_steps": 455, "total_steps": 1870, "loss": 0.6139, "lr": 4.307401744143304e-06, "epoch": 2.4331550802139037, "percentage": 24.33, "elapsed_time": "2:56:38", "remaining_time": "9:09:19"} +{"current_steps": 456, "total_steps": 1870, "loss": 0.2351, "lr": 4.304497467234317e-06, "epoch": 2.4385026737967914, "percentage": 24.39, "elapsed_time": "2:56:39", "remaining_time": "9:07:46"} +{"current_steps": 457, "total_steps": 1870, "loss": 0.4977, "lr": 4.3015880973366955e-06, "epoch": 2.443850267379679, "percentage": 24.44, "elapsed_time": "2:56:40", "remaining_time": "9:06:15"} +{"current_steps": 458, "total_steps": 1870, "loss": 0.5975, "lr": 4.2986736426618045e-06, "epoch": 2.449197860962567, "percentage": 24.49, "elapsed_time": "2:56:42", "remaining_time": "9:04:47"} +{"current_steps": 459, "total_steps": 1870, "loss": 0.2682, "lr": 4.295754111435361e-06, "epoch": 2.4545454545454546, "percentage": 24.55, "elapsed_time": "2:56:44", "remaining_time": "9:03:18"} +{"current_steps": 460, "total_steps": 1870, "loss": 0.4815, "lr": 4.292829511897409e-06, "epoch": 2.4598930481283423, "percentage": 24.6, "elapsed_time": "2:56:50", "remaining_time": "9:02:03"} +{"current_steps": 461, "total_steps": 1870, "loss": 0.4931, "lr": 4.2898998523022985e-06, "epoch": 2.46524064171123, "percentage": 24.65, "elapsed_time": "2:56:52", "remaining_time": "9:00:35"} +{"current_steps": 462, "total_steps": 1870, "loss": 0.3389, "lr": 4.28696514091866e-06, "epoch": 2.4705882352941178, "percentage": 24.71, "elapsed_time": "2:56:53", "remaining_time": "8:59:05"} +{"current_steps": 463, "total_steps": 1870, "loss": 0.6479, "lr": 4.284025386029381e-06, "epoch": 2.4759358288770055, "percentage": 24.76, "elapsed_time": "2:56:55", "remaining_time": "8:57:38"} +{"current_steps": 464, "total_steps": 1870, "loss": 0.4811, "lr": 4.281080595931587e-06, "epoch": 2.481283422459893, "percentage": 24.81, "elapsed_time": "2:56:58", "remaining_time": "8:56:15"} +{"current_steps": 465, "total_steps": 1870, "loss": 0.7251, "lr": 4.27813077893661e-06, "epoch": 2.486631016042781, "percentage": 24.87, "elapsed_time": "2:57:02", "remaining_time": "8:54:56"} +{"current_steps": 466, "total_steps": 1870, "loss": 0.4036, "lr": 4.2751759433699745e-06, "epoch": 2.4919786096256686, "percentage": 24.92, "elapsed_time": "2:57:04", "remaining_time": "8:53:28"} +{"current_steps": 467, "total_steps": 1870, "loss": 0.6547, "lr": 4.2722160975713675e-06, "epoch": 2.497326203208556, "percentage": 24.97, "elapsed_time": "2:57:07", "remaining_time": "8:52:07"} +{"current_steps": 468, "total_steps": 1870, "loss": 0.5507, "lr": 4.269251249894617e-06, "epoch": 2.502673796791444, "percentage": 25.03, "elapsed_time": "2:57:09", "remaining_time": "8:50:44"} +{"current_steps": 469, "total_steps": 1870, "loss": 0.4882, "lr": 4.266281408707667e-06, "epoch": 2.5080213903743314, "percentage": 25.08, "elapsed_time": "2:57:15", "remaining_time": "8:49:31"} +{"current_steps": 470, "total_steps": 1870, "loss": 0.2273, "lr": 4.263306582392556e-06, "epoch": 2.5133689839572195, "percentage": 25.13, "elapsed_time": "2:57:16", "remaining_time": "8:48:04"} +{"current_steps": 471, "total_steps": 1870, "loss": 0.6084, "lr": 4.2603267793453925e-06, "epoch": 2.518716577540107, "percentage": 25.19, "elapsed_time": "2:57:18", "remaining_time": "8:46:39"} +{"current_steps": 472, "total_steps": 1870, "loss": 0.283, "lr": 4.257342007976332e-06, "epoch": 2.5240641711229945, "percentage": 25.24, "elapsed_time": "2:57:22", "remaining_time": "8:45:22"} +{"current_steps": 473, "total_steps": 1870, "loss": 0.2611, "lr": 4.254352276709552e-06, "epoch": 2.5294117647058822, "percentage": 25.29, "elapsed_time": "2:57:24", "remaining_time": "8:43:57"} +{"current_steps": 474, "total_steps": 1870, "loss": 0.7555, "lr": 4.251357593983228e-06, "epoch": 2.53475935828877, "percentage": 25.35, "elapsed_time": "2:57:29", "remaining_time": "8:42:45"} +{"current_steps": 475, "total_steps": 1870, "loss": 0.2796, "lr": 4.24835796824951e-06, "epoch": 2.5401069518716577, "percentage": 25.4, "elapsed_time": "2:57:31", "remaining_time": "8:41:22"} +{"current_steps": 476, "total_steps": 1870, "loss": 0.8278, "lr": 4.245353407974503e-06, "epoch": 2.5454545454545454, "percentage": 25.45, "elapsed_time": "2:57:35", "remaining_time": "8:40:04"} +{"current_steps": 477, "total_steps": 1870, "loss": 0.2478, "lr": 4.242343921638235e-06, "epoch": 2.550802139037433, "percentage": 25.51, "elapsed_time": "2:57:37", "remaining_time": "8:38:42"} +{"current_steps": 478, "total_steps": 1870, "loss": 0.5353, "lr": 4.239329517734636e-06, "epoch": 2.556149732620321, "percentage": 25.56, "elapsed_time": "2:57:40", "remaining_time": "8:37:25"} +{"current_steps": 479, "total_steps": 1870, "loss": 0.3595, "lr": 4.2363102047715205e-06, "epoch": 2.5614973262032086, "percentage": 25.61, "elapsed_time": "2:57:43", "remaining_time": "8:36:05"} +{"current_steps": 480, "total_steps": 1870, "loss": 0.4868, "lr": 4.2332859912705545e-06, "epoch": 2.5668449197860963, "percentage": 25.67, "elapsed_time": "2:57:45", "remaining_time": "8:34:45"} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cd228e9c02575a49094195fb80a59a687dac0074 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json @@ -0,0 +1,10303 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 1467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 5.908512115478516, + "learning_rate": 5e-06, + "loss": 0.9606, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 4.304474353790283, + "learning_rate": 4.999995356617983e-06, + "loss": 0.8609, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 5.63697624206543, + "learning_rate": 4.999981426489179e-06, + "loss": 1.3543, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 3.6674246788024902, + "learning_rate": 4.999958209665336e-06, + "loss": 0.787, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 48.14854431152344, + "learning_rate": 4.999925706232695e-06, + "loss": 1.7786, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 7.8689866065979, + "learning_rate": 4.999883916312e-06, + "loss": 1.2175, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 5.119968891143799, + "learning_rate": 4.9998328400584864e-06, + "loss": 0.8998, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 3.730757713317871, + "learning_rate": 4.999772477661888e-06, + "loss": 0.8419, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 27.314565658569336, + "learning_rate": 4.999702829346432e-06, + "loss": 1.7948, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.822697162628174, + "learning_rate": 4.999623895370843e-06, + "loss": 1.0461, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 4.71220588684082, + "learning_rate": 4.999535676028338e-06, + "loss": 1.0, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.2378087043762207, + "learning_rate": 4.999438171646624e-06, + "loss": 0.9475, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 3.475543737411499, + "learning_rate": 4.999331382587901e-06, + "loss": 0.8654, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 10.06365966796875, + "learning_rate": 4.999215309248861e-06, + "loss": 1.2042, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 3.785153865814209, + "learning_rate": 4.999089952060681e-06, + "loss": 0.8846, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 2.944488048553467, + "learning_rate": 4.998955311489025e-06, + "loss": 0.8805, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 39.89304733276367, + "learning_rate": 4.998811388034046e-06, + "loss": 1.5882, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 3.5883963108062744, + "learning_rate": 4.9986581822303746e-06, + "loss": 0.9222, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 6.972247123718262, + "learning_rate": 4.998495694647127e-06, + "loss": 1.4088, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 3.948991298675537, + "learning_rate": 4.998323925887895e-06, + "loss": 1.454, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 3.8690035343170166, + "learning_rate": 4.998142876590749e-06, + "loss": 0.6335, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 5.243765830993652, + "learning_rate": 4.997952547428236e-06, + "loss": 0.6725, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 3.5994043350219727, + "learning_rate": 4.997752939107372e-06, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 4.06965970993042, + "learning_rate": 4.997544052369642e-06, + "loss": 0.9683, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 3.3247246742248535, + "learning_rate": 4.997325887990999e-06, + "loss": 0.9414, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 5.811742782592773, + "learning_rate": 4.997098446781861e-06, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 2.661334753036499, + "learning_rate": 4.996861729587103e-06, + "loss": 0.7708, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.863943576812744, + "learning_rate": 4.996615737286061e-06, + "loss": 0.6995, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 20.376733779907227, + "learning_rate": 4.996360470792524e-06, + "loss": 1.2563, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 3.62265682220459, + "learning_rate": 4.996095931054731e-06, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 3.915076732635498, + "learning_rate": 4.9958221190553705e-06, + "loss": 0.9227, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 3.129855155944824, + "learning_rate": 4.995539035811572e-06, + "loss": 0.701, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 2.7532224655151367, + "learning_rate": 4.9952466823749076e-06, + "loss": 0.6491, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 2.8444128036499023, + "learning_rate": 4.9949450598313835e-06, + "loss": 0.8029, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.57743239402771, + "learning_rate": 4.994634169301439e-06, + "loss": 0.8785, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 3.280055284500122, + "learning_rate": 4.994314011939941e-06, + "loss": 1.034, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 2.455838680267334, + "learning_rate": 4.99398458893618e-06, + "loss": 0.8557, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 4.72681188583374, + "learning_rate": 4.993645901513865e-06, + "loss": 1.1904, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 3.0585641860961914, + "learning_rate": 4.993297950931121e-06, + "loss": 0.7668, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.4603540897369385, + "learning_rate": 4.9929407384804806e-06, + "loss": 0.8812, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.9702436923980713, + "learning_rate": 4.992574265488883e-06, + "loss": 0.8878, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 2.6973602771759033, + "learning_rate": 4.9921985333176694e-06, + "loss": 0.7251, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 2.5542335510253906, + "learning_rate": 4.991813543362572e-06, + "loss": 0.6638, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.7530782222747803, + "learning_rate": 4.991419297053716e-06, + "loss": 1.0725, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 2.6483025550842285, + "learning_rate": 4.991015795855611e-06, + "loss": 0.7238, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 3.434422492980957, + "learning_rate": 4.990603041267144e-06, + "loss": 0.9188, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.914340019226074, + "learning_rate": 4.990181034821578e-06, + "loss": 0.6158, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 2.7211625576019287, + "learning_rate": 4.98974977808654e-06, + "loss": 0.7165, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.8414249420166016, + "learning_rate": 4.989309272664026e-06, + "loss": 0.7277, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 3.683204412460327, + "learning_rate": 4.988859520190381e-06, + "loss": 0.9793, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 3.1732583045959473, + "learning_rate": 4.988400522336304e-06, + "loss": 0.8966, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 2.7789194583892822, + "learning_rate": 4.9879322808068365e-06, + "loss": 0.8191, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.754816770553589, + "learning_rate": 4.987454797341358e-06, + "loss": 0.6308, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.730104684829712, + "learning_rate": 4.98696807371358e-06, + "loss": 0.8226, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 3.2225449085235596, + "learning_rate": 4.986472111731536e-06, + "loss": 0.9184, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 3.2684760093688965, + "learning_rate": 4.985966913237581e-06, + "loss": 0.6593, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.43105411529541, + "learning_rate": 4.985452480108376e-06, + "loss": 0.6994, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 7.366360664367676, + "learning_rate": 4.984928814254889e-06, + "loss": 1.1374, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.81864333152771, + "learning_rate": 4.984395917622387e-06, + "loss": 0.8097, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 3.1107730865478516, + "learning_rate": 4.9838537921904206e-06, + "loss": 0.8511, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 2.460545301437378, + "learning_rate": 4.9833024399728295e-06, + "loss": 0.898, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.921992778778076, + "learning_rate": 4.982741863017722e-06, + "loss": 0.6671, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 3.3006443977355957, + "learning_rate": 4.982172063407479e-06, + "loss": 1.0559, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.642587661743164, + "learning_rate": 4.9815930432587365e-06, + "loss": 0.6663, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.905898094177246, + "learning_rate": 4.981004804722384e-06, + "loss": 0.6895, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.9174182415008545, + "learning_rate": 4.980407349983556e-06, + "loss": 0.7982, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 2.214322805404663, + "learning_rate": 4.979800681261619e-06, + "loss": 0.6808, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 2.7152462005615234, + "learning_rate": 4.9791848008101705e-06, + "loss": 0.567, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.5657734870910645, + "learning_rate": 4.978559710917024e-06, + "loss": 0.7745, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 3.9103832244873047, + "learning_rate": 4.977925413904205e-06, + "loss": 0.9815, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 4.610236644744873, + "learning_rate": 4.9772819121279395e-06, + "loss": 1.164, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 3.01170015335083, + "learning_rate": 4.976629207978648e-06, + "loss": 0.7587, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 3.175889253616333, + "learning_rate": 4.975967303880933e-06, + "loss": 0.58, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 2.503741502761841, + "learning_rate": 4.975296202293575e-06, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 2.6778078079223633, + "learning_rate": 4.974615905709518e-06, + "loss": 0.7352, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 5.950812816619873, + "learning_rate": 4.973926416655863e-06, + "loss": 1.0643, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 3.0165305137634277, + "learning_rate": 4.973227737693858e-06, + "loss": 0.6699, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 4.793259620666504, + "learning_rate": 4.972519871418894e-06, + "loss": 1.0315, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 3.632815361022949, + "learning_rate": 4.971802820460481e-06, + "loss": 0.7003, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 3.077507734298706, + "learning_rate": 4.971076587482254e-06, + "loss": 0.6776, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 3.3886241912841797, + "learning_rate": 4.970341175181957e-06, + "loss": 0.7422, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 2.71288800239563, + "learning_rate": 4.969596586291425e-06, + "loss": 0.7471, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.777920961380005, + "learning_rate": 4.968842823576592e-06, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 6.496985912322998, + "learning_rate": 4.968079889837461e-06, + "loss": 0.9965, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.6163430213928223, + "learning_rate": 4.967307787908108e-06, + "loss": 0.6833, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 3.244098663330078, + "learning_rate": 4.966526520656663e-06, + "loss": 0.8373, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.9027860164642334, + "learning_rate": 4.965736090985305e-06, + "loss": 0.8529, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 2.3786230087280273, + "learning_rate": 4.964936501830246e-06, + "loss": 0.6577, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 7.3099045753479, + "learning_rate": 4.964127756161727e-06, + "loss": 1.1184, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 3.068873167037964, + "learning_rate": 4.963309856983998e-06, + "loss": 0.7906, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 3.082547426223755, + "learning_rate": 4.9624828073353144e-06, + "loss": 0.8107, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.4586973190307617, + "learning_rate": 4.961646610287922e-06, + "loss": 0.7421, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 2.779277801513672, + "learning_rate": 4.960801268948047e-06, + "loss": 0.7134, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 3.2255213260650635, + "learning_rate": 4.959946786455882e-06, + "loss": 0.5875, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 2.783395528793335, + "learning_rate": 4.959083165985581e-06, + "loss": 0.6595, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 2.240114212036133, + "learning_rate": 4.958210410745237e-06, + "loss": 0.793, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.9399421215057373, + "learning_rate": 4.957328523976879e-06, + "loss": 0.5896, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 3.4449355602264404, + "learning_rate": 4.956437508956458e-06, + "loss": 0.8658, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 4.273710250854492, + "learning_rate": 4.9555373689938325e-06, + "loss": 0.8316, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.4222047328948975, + "learning_rate": 4.954628107432757e-06, + "loss": 1.0613, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 2.5318963527679443, + "learning_rate": 4.95370972765087e-06, + "loss": 0.7194, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 2.7852585315704346, + "learning_rate": 4.952782233059683e-06, + "loss": 0.5927, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 2.6532323360443115, + "learning_rate": 4.951845627104565e-06, + "loss": 0.8505, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 2.3213467597961426, + "learning_rate": 4.95089991326473e-06, + "loss": 0.8682, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 2.607992649078369, + "learning_rate": 4.9499450950532305e-06, + "loss": 0.8735, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 3.9820072650909424, + "learning_rate": 4.94898117601693e-06, + "loss": 1.0571, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 3.3878824710845947, + "learning_rate": 4.948008159736507e-06, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 2.6935670375823975, + "learning_rate": 4.94702604982643e-06, + "loss": 0.5968, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.78190016746521, + "learning_rate": 4.9460348499349485e-06, + "loss": 0.7504, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 2.973083972930908, + "learning_rate": 4.945034563744077e-06, + "loss": 0.6728, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 2.631803512573242, + "learning_rate": 4.944025194969586e-06, + "loss": 0.609, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 2.7443883419036865, + "learning_rate": 4.9430067473609825e-06, + "loss": 0.8713, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 2.543769121170044, + "learning_rate": 4.941979224701499e-06, + "loss": 0.8035, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 3.7799901962280273, + "learning_rate": 4.94094263080808e-06, + "loss": 0.9341, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 3.1234734058380127, + "learning_rate": 4.939896969531367e-06, + "loss": 1.1066, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 2.356036424636841, + "learning_rate": 4.938842244755683e-06, + "loss": 0.853, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 3.6231274604797363, + "learning_rate": 4.937778460399022e-06, + "loss": 0.9116, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 3.1277005672454834, + "learning_rate": 4.936705620413028e-06, + "loss": 0.5888, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.7338361740112305, + "learning_rate": 4.935623728782986e-06, + "loss": 0.592, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 2.748363733291626, + "learning_rate": 4.934532789527805e-06, + "loss": 0.8713, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 4.460031986236572, + "learning_rate": 4.933432806700004e-06, + "loss": 0.6791, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 2.392911911010742, + "learning_rate": 4.932323784385693e-06, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.7804384231567383, + "learning_rate": 4.931205726704566e-06, + "loss": 0.7547, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.7664780616760254, + "learning_rate": 4.930078637809878e-06, + "loss": 0.7849, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 2.592808723449707, + "learning_rate": 4.928942521888431e-06, + "loss": 0.7015, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 2.7080585956573486, + "learning_rate": 4.927797383160561e-06, + "loss": 1.0028, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.7941503524780273, + "learning_rate": 4.926643225880123e-06, + "loss": 0.602, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 3.2796623706817627, + "learning_rate": 4.925480054334471e-06, + "loss": 0.7473, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 2.7623610496520996, + "learning_rate": 4.924307872844444e-06, + "loss": 1.0573, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.6224453449249268, + "learning_rate": 4.923126685764351e-06, + "loss": 0.7399, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 17.736326217651367, + "learning_rate": 4.921936497481956e-06, + "loss": 0.9548, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.504213333129883, + "learning_rate": 4.920737312418456e-06, + "loss": 0.6748, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 3.617077350616455, + "learning_rate": 4.919529135028473e-06, + "loss": 0.8431, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.6559832096099854, + "learning_rate": 4.918311969800027e-06, + "loss": 0.7243, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 2.7539305686950684, + "learning_rate": 4.917085821254532e-06, + "loss": 0.7845, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 3.3587615489959717, + "learning_rate": 4.915850693946766e-06, + "loss": 0.4891, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 3.064354181289673, + "learning_rate": 4.914606592464865e-06, + "loss": 0.7917, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 3.2505199909210205, + "learning_rate": 4.9133535214303e-06, + "loss": 0.9681, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 3.8027830123901367, + "learning_rate": 4.91209148549786e-06, + "loss": 0.9275, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 2.4154372215270996, + "learning_rate": 4.910820489355637e-06, + "loss": 0.7259, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.892462968826294, + "learning_rate": 4.909540537725007e-06, + "loss": 0.6061, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 3.3398196697235107, + "learning_rate": 4.908251635360616e-06, + "loss": 1.0559, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 3.022512197494507, + "learning_rate": 4.906953787050354e-06, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 2.658661365509033, + "learning_rate": 4.905646997615347e-06, + "loss": 0.6234, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 3.454400062561035, + "learning_rate": 4.904331271909932e-06, + "loss": 0.8066, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 3.1300277709960938, + "learning_rate": 4.903006614821645e-06, + "loss": 0.6861, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 2.362537145614624, + "learning_rate": 4.901673031271194e-06, + "loss": 0.6112, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 3.375577688217163, + "learning_rate": 4.900330526212451e-06, + "loss": 0.6314, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 2.955656051635742, + "learning_rate": 4.898979104632427e-06, + "loss": 0.889, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 2.9285926818847656, + "learning_rate": 4.897618771551255e-06, + "loss": 0.6406, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.131819725036621, + "learning_rate": 4.8962495320221714e-06, + "loss": 0.6368, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 2.780649185180664, + "learning_rate": 4.8948713911315e-06, + "loss": 0.8642, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 2.941500186920166, + "learning_rate": 4.8934843539986266e-06, + "loss": 0.714, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.892088425775986e-06, + "loss": 0.8365, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 2.6887171268463135, + "learning_rate": 4.890683611649041e-06, + "loss": 0.7937, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 3.7638463973999023, + "learning_rate": 4.8892699168362626e-06, + "loss": 0.7485, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.8132755756378174, + "learning_rate": 4.887847346589111e-06, + "loss": 0.6467, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.652247190475464, + "learning_rate": 4.886415906192015e-06, + "loss": 0.4651, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 2.5854647159576416, + "learning_rate": 4.884975600962355e-06, + "loss": 0.8756, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 3.1630544662475586, + "learning_rate": 4.883526436250441e-06, + "loss": 0.7339, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.84452748298645, + "learning_rate": 4.8820684174394935e-06, + "loss": 0.7808, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 3.604048490524292, + "learning_rate": 4.880601549945622e-06, + "loss": 0.96, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 2.302924871444702, + "learning_rate": 4.879125839217808e-06, + "loss": 0.8122, + "step": 163 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 3.1254405975341797, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.7307, + "step": 164 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 2.745603322982788, + "learning_rate": 4.8761479100205085e-06, + "loss": 0.7554, + "step": 165 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 2.494840145111084, + "learning_rate": 4.874645702613152e-06, + "loss": 0.4372, + "step": 166 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 2.3526735305786133, + "learning_rate": 4.873134674096072e-06, + "loss": 0.3597, + "step": 167 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 2.945887804031372, + "learning_rate": 4.871614830082297e-06, + "loss": 0.5854, + "step": 168 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 3.5723934173583984, + "learning_rate": 4.870086176217597e-06, + "loss": 0.7978, + "step": 169 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 3.2997145652770996, + "learning_rate": 4.868548718180473e-06, + "loss": 0.5593, + "step": 170 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 3.4120635986328125, + "learning_rate": 4.867002461682129e-06, + "loss": 0.4083, + "step": 171 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 2.697617292404175, + "learning_rate": 4.8654474124664505e-06, + "loss": 0.4752, + "step": 172 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 5.082247734069824, + "learning_rate": 4.863883576309991e-06, + "loss": 0.7435, + "step": 173 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 2.773864984512329, + "learning_rate": 4.8623109590219395e-06, + "loss": 0.4612, + "step": 174 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 3.429703712463379, + "learning_rate": 4.860729566444106e-06, + "loss": 0.4644, + "step": 175 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 2.997938394546509, + "learning_rate": 4.8591394044508985e-06, + "loss": 0.4852, + "step": 176 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 2.549513339996338, + "learning_rate": 4.857540478949302e-06, + "loss": 0.4574, + "step": 177 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 3.459400177001953, + "learning_rate": 4.855932795878852e-06, + "loss": 0.8095, + "step": 178 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 2.8103644847869873, + "learning_rate": 4.854316361211619e-06, + "loss": 0.4578, + "step": 179 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 2.631221055984497, + "learning_rate": 4.852691180952183e-06, + "loss": 0.5473, + "step": 180 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 3.189946174621582, + "learning_rate": 4.851057261137608e-06, + "loss": 0.4313, + "step": 181 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 2.891418933868408, + "learning_rate": 4.8494146078374274e-06, + "loss": 0.4197, + "step": 182 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 3.239637613296509, + "learning_rate": 4.847763227153612e-06, + "loss": 0.5865, + "step": 183 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 2.484644651412964, + "learning_rate": 4.846103125220557e-06, + "loss": 0.3866, + "step": 184 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 3.1045992374420166, + "learning_rate": 4.844434308205052e-06, + "loss": 0.5357, + "step": 185 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 2.648472309112549, + "learning_rate": 4.842756782306261e-06, + "loss": 0.4783, + "step": 186 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 2.5685644149780273, + "learning_rate": 4.841070553755697e-06, + "loss": 0.3733, + "step": 187 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 3.7727200984954834, + "learning_rate": 4.839375628817205e-06, + "loss": 0.6039, + "step": 188 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 2.8237369060516357, + "learning_rate": 4.837672013786931e-06, + "loss": 0.5372, + "step": 189 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 3.0312252044677734, + "learning_rate": 4.835959714993305e-06, + "loss": 0.5162, + "step": 190 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 2.821498394012451, + "learning_rate": 4.8342387387970105e-06, + "loss": 0.4537, + "step": 191 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 2.7834129333496094, + "learning_rate": 4.832509091590968e-06, + "loss": 0.6165, + "step": 192 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 2.9274091720581055, + "learning_rate": 4.830770779800309e-06, + "loss": 0.7475, + "step": 193 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 2.813945770263672, + "learning_rate": 4.829023809882349e-06, + "loss": 0.4629, + "step": 194 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 2.27876877784729, + "learning_rate": 4.827268188326567e-06, + "loss": 0.5208, + "step": 195 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 2.8444204330444336, + "learning_rate": 4.825503921654582e-06, + "loss": 0.6521, + "step": 196 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 3.3730578422546387, + "learning_rate": 4.823731016420122e-06, + "loss": 0.7491, + "step": 197 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 2.9717822074890137, + "learning_rate": 4.821949479209011e-06, + "loss": 0.3866, + "step": 198 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 2.6570653915405273, + "learning_rate": 4.820159316639133e-06, + "loss": 0.499, + "step": 199 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 2.819960117340088, + "learning_rate": 4.818360535360418e-06, + "loss": 0.556, + "step": 200 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 2.7912111282348633, + "learning_rate": 4.816553142054806e-06, + "loss": 0.3433, + "step": 201 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 2.6427981853485107, + "learning_rate": 4.814737143436232e-06, + "loss": 0.8808, + "step": 202 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 2.5917580127716064, + "learning_rate": 4.812912546250595e-06, + "loss": 0.5718, + "step": 203 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 3.770759344100952, + "learning_rate": 4.81107935727574e-06, + "loss": 0.9743, + "step": 204 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 2.558248996734619, + "learning_rate": 4.809237583321421e-06, + "loss": 0.2821, + "step": 205 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 2.692087173461914, + "learning_rate": 4.807387231229287e-06, + "loss": 0.7524, + "step": 206 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 2.661738157272339, + "learning_rate": 4.8055283078728525e-06, + "loss": 0.4304, + "step": 207 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 2.9232122898101807, + "learning_rate": 4.803660820157468e-06, + "loss": 0.6986, + "step": 208 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 2.665097951889038, + "learning_rate": 4.801784775020303e-06, + "loss": 0.7112, + "step": 209 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 2.4504497051239014, + "learning_rate": 4.799900179430312e-06, + "loss": 0.4125, + "step": 210 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 3.076204538345337, + "learning_rate": 4.798007040388212e-06, + "loss": 0.7057, + "step": 211 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 2.406977653503418, + "learning_rate": 4.7961053649264585e-06, + "loss": 0.708, + "step": 212 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 2.6545324325561523, + "learning_rate": 4.794195160109215e-06, + "loss": 0.7608, + "step": 213 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 4.3817033767700195, + "learning_rate": 4.7922764330323315e-06, + "loss": 0.4779, + "step": 214 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 3.534566879272461, + "learning_rate": 4.790349190823313e-06, + "loss": 0.5464, + "step": 215 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 3.0323140621185303, + "learning_rate": 4.788413440641297e-06, + "loss": 0.6198, + "step": 216 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 2.612746238708496, + "learning_rate": 4.786469189677026e-06, + "loss": 0.6695, + "step": 217 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 3.0299434661865234, + "learning_rate": 4.784516445152821e-06, + "loss": 0.4902, + "step": 218 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 3.4521942138671875, + "learning_rate": 4.78255521432255e-06, + "loss": 0.7411, + "step": 219 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 2.6712653636932373, + "learning_rate": 4.780585504471612e-06, + "loss": 0.8767, + "step": 220 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 2.5099475383758545, + "learning_rate": 4.778607322916896e-06, + "loss": 0.4266, + "step": 221 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 2.641799211502075, + "learning_rate": 4.776620677006766e-06, + "loss": 0.4982, + "step": 222 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 3.1119771003723145, + "learning_rate": 4.7746255741210256e-06, + "loss": 0.6012, + "step": 223 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 3.9957170486450195, + "learning_rate": 4.772622021670897e-06, + "loss": 0.7585, + "step": 224 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 3.1070823669433594, + "learning_rate": 4.770610027098983e-06, + "loss": 0.5266, + "step": 225 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 2.7630460262298584, + "learning_rate": 4.7685895978792564e-06, + "loss": 0.6261, + "step": 226 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 2.6509556770324707, + "learning_rate": 4.766560741517014e-06, + "loss": 0.7081, + "step": 227 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 3.0212976932525635, + "learning_rate": 4.76452346554886e-06, + "loss": 0.5041, + "step": 228 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 3.0454728603363037, + "learning_rate": 4.762477777542676e-06, + "loss": 0.49, + "step": 229 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 3.4296791553497314, + "learning_rate": 4.7604236850975905e-06, + "loss": 0.7056, + "step": 230 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 4.1885600090026855, + "learning_rate": 4.7583611958439514e-06, + "loss": 0.7762, + "step": 231 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 3.065854072570801, + "learning_rate": 4.7562903174433e-06, + "loss": 0.5347, + "step": 232 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 2.793851852416992, + "learning_rate": 4.75421105758834e-06, + "loss": 0.503, + "step": 233 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 3.123730421066284, + "learning_rate": 4.752123424002908e-06, + "loss": 0.5081, + "step": 234 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 3.230161666870117, + "learning_rate": 4.750027424441949e-06, + "loss": 0.7523, + "step": 235 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 2.4970247745513916, + "learning_rate": 4.747923066691487e-06, + "loss": 0.5575, + "step": 236 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 2.9880685806274414, + "learning_rate": 4.745810358568588e-06, + "loss": 0.7264, + "step": 237 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 2.555328369140625, + "learning_rate": 4.743689307921342e-06, + "loss": 0.4545, + "step": 238 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 3.144932746887207, + "learning_rate": 4.741559922628828e-06, + "loss": 0.5429, + "step": 239 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 3.059807062149048, + "learning_rate": 4.739422210601085e-06, + "loss": 0.5086, + "step": 240 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 3.374303102493286, + "learning_rate": 4.7372761797790836e-06, + "loss": 0.6109, + "step": 241 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 2.4506947994232178, + "learning_rate": 4.735121838134697e-06, + "loss": 0.4317, + "step": 242 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 2.9039974212646484, + "learning_rate": 4.732959193670672e-06, + "loss": 0.6414, + "step": 243 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 2.9412453174591064, + "learning_rate": 4.730788254420593e-06, + "loss": 0.5166, + "step": 244 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 2.500716209411621, + "learning_rate": 4.728609028448862e-06, + "loss": 0.4982, + "step": 245 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 2.4233803749084473, + "learning_rate": 4.726421523850662e-06, + "loss": 0.7552, + "step": 246 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 2.357003688812256, + "learning_rate": 4.7242257487519275e-06, + "loss": 0.4365, + "step": 247 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 2.6406495571136475, + "learning_rate": 4.722021711309317e-06, + "loss": 0.6002, + "step": 248 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 2.736884832382202, + "learning_rate": 4.7198094197101826e-06, + "loss": 0.4993, + "step": 249 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 3.5238845348358154, + "learning_rate": 4.7175888821725335e-06, + "loss": 0.4637, + "step": 250 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 3.3783695697784424, + "learning_rate": 4.715360106945015e-06, + "loss": 0.9711, + "step": 251 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 2.9685862064361572, + "learning_rate": 4.713123102306869e-06, + "loss": 0.5452, + "step": 252 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 3.143733263015747, + "learning_rate": 4.710877876567912e-06, + "loss": 0.5034, + "step": 253 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 2.8005623817443848, + "learning_rate": 4.708624438068494e-06, + "loss": 0.4236, + "step": 254 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 2.66581130027771, + "learning_rate": 4.706362795179476e-06, + "loss": 0.6095, + "step": 255 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 4.598043441772461, + "learning_rate": 4.7040929563021975e-06, + "loss": 0.738, + "step": 256 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 3.5643506050109863, + "learning_rate": 4.70181492986844e-06, + "loss": 0.6726, + "step": 257 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 2.865339994430542, + "learning_rate": 4.699528724340401e-06, + "loss": 0.4862, + "step": 258 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 2.95529842376709, + "learning_rate": 4.6972343482106615e-06, + "loss": 0.5003, + "step": 259 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 2.45206356048584, + "learning_rate": 4.6949318100021546e-06, + "loss": 0.6734, + "step": 260 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 2.6789939403533936, + "learning_rate": 4.6926211182681295e-06, + "loss": 0.5639, + "step": 261 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 3.307732582092285, + "learning_rate": 4.690302281592128e-06, + "loss": 0.7032, + "step": 262 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 2.8950445652008057, + "learning_rate": 4.687975308587944e-06, + "loss": 0.4937, + "step": 263 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 2.969377040863037, + "learning_rate": 4.685640207899598e-06, + "loss": 0.5829, + "step": 264 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 3.106433391571045, + "learning_rate": 4.683296988201301e-06, + "loss": 0.3805, + "step": 265 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 3.5599050521850586, + "learning_rate": 4.680945658197425e-06, + "loss": 0.7939, + "step": 266 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 5.008603096008301, + "learning_rate": 4.6785862266224695e-06, + "loss": 0.7511, + "step": 267 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 3.1393773555755615, + "learning_rate": 4.676218702241026e-06, + "loss": 0.8984, + "step": 268 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 3.0241408348083496, + "learning_rate": 4.673843093847753e-06, + "loss": 0.5473, + "step": 269 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 2.9029417037963867, + "learning_rate": 4.6714594102673355e-06, + "loss": 0.6626, + "step": 270 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 3.4709246158599854, + "learning_rate": 4.669067660354456e-06, + "loss": 0.5015, + "step": 271 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 2.988635778427124, + "learning_rate": 4.666667852993761e-06, + "loss": 0.5384, + "step": 272 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 3.418140411376953, + "learning_rate": 4.664259997099829e-06, + "loss": 0.7491, + "step": 273 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 2.592416763305664, + "learning_rate": 4.661844101617135e-06, + "loss": 0.6451, + "step": 274 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 3.1174306869506836, + "learning_rate": 4.6594201755200205e-06, + "loss": 0.6299, + "step": 275 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 2.6569998264312744, + "learning_rate": 4.656988227812658e-06, + "loss": 0.4477, + "step": 276 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 3.5733959674835205, + "learning_rate": 4.654548267529015e-06, + "loss": 0.5473, + "step": 277 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 2.7240824699401855, + "learning_rate": 4.652100303732827e-06, + "loss": 0.496, + "step": 278 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 4.1965460777282715, + "learning_rate": 4.64964434551756e-06, + "loss": 0.932, + "step": 279 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 2.3237173557281494, + "learning_rate": 4.647180402006372e-06, + "loss": 0.4648, + "step": 280 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 3.395045042037964, + "learning_rate": 4.644708482352093e-06, + "loss": 0.7237, + "step": 281 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 3.238593816757202, + "learning_rate": 4.6422285957371735e-06, + "loss": 0.5531, + "step": 282 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 3.9651403427124023, + "learning_rate": 4.639740751373663e-06, + "loss": 0.6706, + "step": 283 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 3.0042061805725098, + "learning_rate": 4.63724495850317e-06, + "loss": 0.56, + "step": 284 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 3.094310760498047, + "learning_rate": 4.634741226396832e-06, + "loss": 0.6138, + "step": 285 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 2.838168144226074, + "learning_rate": 4.632229564355275e-06, + "loss": 0.4908, + "step": 286 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 3.3452796936035156, + "learning_rate": 4.629709981708586e-06, + "loss": 0.8181, + "step": 287 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 2.6630783081054688, + "learning_rate": 4.6271824878162704e-06, + "loss": 0.5625, + "step": 288 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 2.583650588989258, + "learning_rate": 4.624647092067226e-06, + "loss": 0.3416, + "step": 289 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 2.73132586479187, + "learning_rate": 4.622103803879702e-06, + "loss": 0.3889, + "step": 290 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 4.1010260581970215, + "learning_rate": 4.619552632701263e-06, + "loss": 0.611, + "step": 291 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 4.53068208694458, + "learning_rate": 4.61699358800876e-06, + "loss": 0.7219, + "step": 292 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 3.4877254962921143, + "learning_rate": 4.614426679308291e-06, + "loss": 0.6402, + "step": 293 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 2.9445226192474365, + "learning_rate": 4.611851916135166e-06, + "loss": 0.509, + "step": 294 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 2.6622228622436523, + "learning_rate": 4.609269308053872e-06, + "loss": 0.6167, + "step": 295 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 3.131530523300171, + "learning_rate": 4.606678864658039e-06, + "loss": 0.8039, + "step": 296 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 3.212188482284546, + "learning_rate": 4.604080595570399e-06, + "loss": 0.5754, + "step": 297 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 3.522850275039673, + "learning_rate": 4.601474510442759e-06, + "loss": 0.4432, + "step": 298 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 2.5877151489257812, + "learning_rate": 4.598860618955957e-06, + "loss": 0.6541, + "step": 299 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 2.803833484649658, + "learning_rate": 4.596238930819832e-06, + "loss": 0.5824, + "step": 300 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 2.7125494480133057, + "learning_rate": 4.5936094557731815e-06, + "loss": 0.6976, + "step": 301 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 3.6549370288848877, + "learning_rate": 4.590972203583732e-06, + "loss": 0.7105, + "step": 302 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 3.3241944313049316, + "learning_rate": 4.588327184048099e-06, + "loss": 0.7446, + "step": 303 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 2.8388822078704834, + "learning_rate": 4.585674406991752e-06, + "loss": 0.4926, + "step": 304 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 2.9760420322418213, + "learning_rate": 4.5830138822689755e-06, + "loss": 0.7368, + "step": 305 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 2.5437633991241455, + "learning_rate": 4.5803456197628374e-06, + "loss": 0.4678, + "step": 306 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 3.0044775009155273, + "learning_rate": 4.577669629385145e-06, + "loss": 0.4241, + "step": 307 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 2.6150901317596436, + "learning_rate": 4.574985921076418e-06, + "loss": 0.5327, + "step": 308 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 2.4425182342529297, + "learning_rate": 4.572294504805841e-06, + "loss": 0.7504, + "step": 309 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 2.9920194149017334, + "learning_rate": 4.569595390571232e-06, + "loss": 0.5194, + "step": 310 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 2.701087713241577, + "learning_rate": 4.566888588399007e-06, + "loss": 0.6862, + "step": 311 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 7.628893852233887, + "learning_rate": 4.564174108344139e-06, + "loss": 0.6867, + "step": 312 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 2.712947130203247, + "learning_rate": 4.561451960490123e-06, + "loss": 0.6942, + "step": 313 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 3.0063202381134033, + "learning_rate": 4.558722154948937e-06, + "loss": 0.6346, + "step": 314 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 2.957218647003174, + "learning_rate": 4.5559847018610034e-06, + "loss": 0.464, + "step": 315 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 3.322282552719116, + "learning_rate": 4.553239611395156e-06, + "loss": 0.6334, + "step": 316 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 3.0638647079467773, + "learning_rate": 4.550486893748596e-06, + "loss": 0.4227, + "step": 317 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 3.079087257385254, + "learning_rate": 4.547726559146862e-06, + "loss": 0.3719, + "step": 318 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 2.409914255142212, + "learning_rate": 4.544958617843782e-06, + "loss": 0.3331, + "step": 319 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 3.3441262245178223, + "learning_rate": 4.542183080121444e-06, + "loss": 0.6931, + "step": 320 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 2.6624436378479004, + "learning_rate": 4.539399956290152e-06, + "loss": 0.6578, + "step": 321 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 3.463789224624634, + "learning_rate": 4.536609256688396e-06, + "loss": 0.5748, + "step": 322 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 3.6827807426452637, + "learning_rate": 4.533810991682799e-06, + "loss": 0.5249, + "step": 323 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 4.125547409057617, + "learning_rate": 4.531005171668093e-06, + "loss": 0.3065, + "step": 324 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 2.935978412628174, + "learning_rate": 4.528191807067074e-06, + "loss": 0.5523, + "step": 325 + }, + { + "epoch": 2.0, + "grad_norm": 2.654388427734375, + "learning_rate": 4.525370908330564e-06, + "loss": 0.4157, + "step": 326 + }, + { + "epoch": 2.0061349693251533, + "grad_norm": 3.213925838470459, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4243, + "step": 327 + }, + { + "epoch": 2.0122699386503067, + "grad_norm": 3.5483286380767822, + "learning_rate": 4.519706550394248e-06, + "loss": 0.4137, + "step": 328 + }, + { + "epoch": 2.01840490797546, + "grad_norm": 3.32084059715271, + "learning_rate": 4.516863112235864e-06, + "loss": 0.5389, + "step": 329 + }, + { + "epoch": 2.0245398773006134, + "grad_norm": 3.427666425704956, + "learning_rate": 4.514012182024756e-06, + "loss": 0.285, + "step": 330 + }, + { + "epoch": 2.0306748466257667, + "grad_norm": 3.3269975185394287, + "learning_rate": 4.511153770351288e-06, + "loss": 0.4877, + "step": 331 + }, + { + "epoch": 2.03680981595092, + "grad_norm": 5.258850574493408, + "learning_rate": 4.508287887833619e-06, + "loss": 0.5168, + "step": 332 + }, + { + "epoch": 2.042944785276074, + "grad_norm": 4.316092491149902, + "learning_rate": 4.505414545117658e-06, + "loss": 0.4791, + "step": 333 + }, + { + "epoch": 2.049079754601227, + "grad_norm": 3.952056884765625, + "learning_rate": 4.502533752877028e-06, + "loss": 0.3014, + "step": 334 + }, + { + "epoch": 2.0552147239263805, + "grad_norm": 4.0617194175720215, + "learning_rate": 4.499645521813024e-06, + "loss": 0.4313, + "step": 335 + }, + { + "epoch": 2.061349693251534, + "grad_norm": 3.7869274616241455, + "learning_rate": 4.496749862654574e-06, + "loss": 0.4807, + "step": 336 + }, + { + "epoch": 2.067484662576687, + "grad_norm": 3.8181991577148438, + "learning_rate": 4.4938467861582e-06, + "loss": 0.4002, + "step": 337 + }, + { + "epoch": 2.0736196319018405, + "grad_norm": 3.8289854526519775, + "learning_rate": 4.490936303107975e-06, + "loss": 0.618, + "step": 338 + }, + { + "epoch": 2.079754601226994, + "grad_norm": 3.121443271636963, + "learning_rate": 4.488018424315488e-06, + "loss": 0.4203, + "step": 339 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 3.141782283782959, + "learning_rate": 4.4850931606198e-06, + "loss": 0.3618, + "step": 340 + }, + { + "epoch": 2.0920245398773005, + "grad_norm": 3.1279287338256836, + "learning_rate": 4.482160522887404e-06, + "loss": 0.4571, + "step": 341 + }, + { + "epoch": 2.098159509202454, + "grad_norm": 3.2418482303619385, + "learning_rate": 4.479220522012185e-06, + "loss": 0.2674, + "step": 342 + }, + { + "epoch": 2.104294478527607, + "grad_norm": 10.230683326721191, + "learning_rate": 4.476273168915382e-06, + "loss": 0.5479, + "step": 343 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 3.588361978530884, + "learning_rate": 4.473318474545544e-06, + "loss": 0.3654, + "step": 344 + }, + { + "epoch": 2.116564417177914, + "grad_norm": 3.0913164615631104, + "learning_rate": 4.470356449878489e-06, + "loss": 0.2704, + "step": 345 + }, + { + "epoch": 2.1226993865030677, + "grad_norm": 3.972447633743286, + "learning_rate": 4.467387105917269e-06, + "loss": 0.3029, + "step": 346 + }, + { + "epoch": 2.128834355828221, + "grad_norm": 3.7174713611602783, + "learning_rate": 4.464410453692122e-06, + "loss": 0.6536, + "step": 347 + }, + { + "epoch": 2.1349693251533743, + "grad_norm": 3.9333994388580322, + "learning_rate": 4.461426504260434e-06, + "loss": 0.3806, + "step": 348 + }, + { + "epoch": 2.1411042944785277, + "grad_norm": 4.752816200256348, + "learning_rate": 4.458435268706699e-06, + "loss": 0.4019, + "step": 349 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 2.505603790283203, + "learning_rate": 4.455436758142477e-06, + "loss": 0.2348, + "step": 350 + }, + { + "epoch": 2.1533742331288344, + "grad_norm": 3.3050570487976074, + "learning_rate": 4.452430983706351e-06, + "loss": 0.505, + "step": 351 + }, + { + "epoch": 2.1595092024539877, + "grad_norm": 5.387442588806152, + "learning_rate": 4.44941795656389e-06, + "loss": 0.399, + "step": 352 + }, + { + "epoch": 2.165644171779141, + "grad_norm": 3.4759480953216553, + "learning_rate": 4.446397687907601e-06, + "loss": 0.5664, + "step": 353 + }, + { + "epoch": 2.1717791411042944, + "grad_norm": 2.949445962905884, + "learning_rate": 4.4433701889568935e-06, + "loss": 0.2128, + "step": 354 + }, + { + "epoch": 2.1779141104294477, + "grad_norm": 3.2884252071380615, + "learning_rate": 4.440335470958035e-06, + "loss": 0.3138, + "step": 355 + }, + { + "epoch": 2.184049079754601, + "grad_norm": 3.1605632305145264, + "learning_rate": 4.437293545184111e-06, + "loss": 0.349, + "step": 356 + }, + { + "epoch": 2.190184049079755, + "grad_norm": 2.9996821880340576, + "learning_rate": 4.434244422934976e-06, + "loss": 0.343, + "step": 357 + }, + { + "epoch": 2.196319018404908, + "grad_norm": 3.6373324394226074, + "learning_rate": 4.431188115537226e-06, + "loss": 0.5656, + "step": 358 + }, + { + "epoch": 2.2024539877300615, + "grad_norm": 4.667621612548828, + "learning_rate": 4.428124634344141e-06, + "loss": 0.2335, + "step": 359 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 3.815484046936035, + "learning_rate": 4.425053990735653e-06, + "loss": 0.2173, + "step": 360 + }, + { + "epoch": 2.214723926380368, + "grad_norm": 4.689478874206543, + "learning_rate": 4.421976196118297e-06, + "loss": 0.5071, + "step": 361 + }, + { + "epoch": 2.2208588957055215, + "grad_norm": 4.016942024230957, + "learning_rate": 4.4188912619251765e-06, + "loss": 0.384, + "step": 362 + }, + { + "epoch": 2.226993865030675, + "grad_norm": 3.5336828231811523, + "learning_rate": 4.415799199615912e-06, + "loss": 0.3133, + "step": 363 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 2.9195592403411865, + "learning_rate": 4.4127000206766055e-06, + "loss": 0.3847, + "step": 364 + }, + { + "epoch": 2.2392638036809815, + "grad_norm": 2.6843531131744385, + "learning_rate": 4.409593736619795e-06, + "loss": 0.3539, + "step": 365 + }, + { + "epoch": 2.245398773006135, + "grad_norm": 2.8692703247070312, + "learning_rate": 4.40648035898441e-06, + "loss": 0.3664, + "step": 366 + }, + { + "epoch": 2.2515337423312882, + "grad_norm": 2.820422649383545, + "learning_rate": 4.403359899335732e-06, + "loss": 0.4606, + "step": 367 + }, + { + "epoch": 2.2576687116564416, + "grad_norm": 3.8641669750213623, + "learning_rate": 4.400232369265351e-06, + "loss": 0.2931, + "step": 368 + }, + { + "epoch": 2.263803680981595, + "grad_norm": 2.75347638130188, + "learning_rate": 4.39709778039112e-06, + "loss": 0.3393, + "step": 369 + }, + { + "epoch": 2.2699386503067487, + "grad_norm": 15.150428771972656, + "learning_rate": 4.393956144357113e-06, + "loss": 0.65, + "step": 370 + }, + { + "epoch": 2.276073619631902, + "grad_norm": 2.4876065254211426, + "learning_rate": 4.390807472833585e-06, + "loss": 0.372, + "step": 371 + }, + { + "epoch": 2.2822085889570554, + "grad_norm": 2.7328054904937744, + "learning_rate": 4.3876517775169216e-06, + "loss": 0.2802, + "step": 372 + }, + { + "epoch": 2.2883435582822087, + "grad_norm": 2.903221368789673, + "learning_rate": 4.384489070129604e-06, + "loss": 0.1964, + "step": 373 + }, + { + "epoch": 2.294478527607362, + "grad_norm": 3.9368724822998047, + "learning_rate": 4.381319362420158e-06, + "loss": 0.4272, + "step": 374 + }, + { + "epoch": 2.3006134969325154, + "grad_norm": 5.431981086730957, + "learning_rate": 4.378142666163114e-06, + "loss": 0.4513, + "step": 375 + }, + { + "epoch": 2.3067484662576687, + "grad_norm": 3.661733627319336, + "learning_rate": 4.374958993158965e-06, + "loss": 0.6087, + "step": 376 + }, + { + "epoch": 2.312883435582822, + "grad_norm": 3.004450559616089, + "learning_rate": 4.371768355234116e-06, + "loss": 0.2206, + "step": 377 + }, + { + "epoch": 2.3190184049079754, + "grad_norm": 4.3785576820373535, + "learning_rate": 4.368570764240852e-06, + "loss": 0.6055, + "step": 378 + }, + { + "epoch": 2.3251533742331287, + "grad_norm": 3.4699394702911377, + "learning_rate": 4.365366232057279e-06, + "loss": 0.6286, + "step": 379 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 2.6862998008728027, + "learning_rate": 4.3621547705872915e-06, + "loss": 0.2622, + "step": 380 + }, + { + "epoch": 2.3374233128834354, + "grad_norm": 3.056382179260254, + "learning_rate": 4.358936391760524e-06, + "loss": 0.3439, + "step": 381 + }, + { + "epoch": 2.3435582822085887, + "grad_norm": 2.6211307048797607, + "learning_rate": 4.355711107532305e-06, + "loss": 0.3677, + "step": 382 + }, + { + "epoch": 2.3496932515337425, + "grad_norm": 2.682060956954956, + "learning_rate": 4.3524789298836175e-06, + "loss": 0.3068, + "step": 383 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 3.482539415359497, + "learning_rate": 4.349239870821049e-06, + "loss": 0.3737, + "step": 384 + }, + { + "epoch": 2.361963190184049, + "grad_norm": 2.8645472526550293, + "learning_rate": 4.345993942376752e-06, + "loss": 0.2837, + "step": 385 + }, + { + "epoch": 2.3680981595092025, + "grad_norm": 3.6142354011535645, + "learning_rate": 4.342741156608392e-06, + "loss": 0.6162, + "step": 386 + }, + { + "epoch": 2.374233128834356, + "grad_norm": 3.0748162269592285, + "learning_rate": 4.3394815255991135e-06, + "loss": 0.2986, + "step": 387 + }, + { + "epoch": 2.3803680981595092, + "grad_norm": 5.090906620025635, + "learning_rate": 4.336215061457485e-06, + "loss": 0.5383, + "step": 388 + }, + { + "epoch": 2.3865030674846626, + "grad_norm": 3.9235823154449463, + "learning_rate": 4.332941776317458e-06, + "loss": 0.4179, + "step": 389 + }, + { + "epoch": 2.392638036809816, + "grad_norm": 3.482926368713379, + "learning_rate": 4.329661682338325e-06, + "loss": 0.3938, + "step": 390 + }, + { + "epoch": 2.3987730061349692, + "grad_norm": 4.274583339691162, + "learning_rate": 4.32637479170467e-06, + "loss": 0.3349, + "step": 391 + }, + { + "epoch": 2.4049079754601226, + "grad_norm": 3.326012372970581, + "learning_rate": 4.323081116626322e-06, + "loss": 0.3336, + "step": 392 + }, + { + "epoch": 2.411042944785276, + "grad_norm": 3.174591541290283, + "learning_rate": 4.319780669338316e-06, + "loss": 0.2983, + "step": 393 + }, + { + "epoch": 2.4171779141104293, + "grad_norm": 3.9073634147644043, + "learning_rate": 4.31647346210084e-06, + "loss": 0.8401, + "step": 394 + }, + { + "epoch": 2.4233128834355826, + "grad_norm": 3.4787721633911133, + "learning_rate": 4.313159507199197e-06, + "loss": 0.2583, + "step": 395 + }, + { + "epoch": 2.4294478527607364, + "grad_norm": 3.19903564453125, + "learning_rate": 4.309838816943755e-06, + "loss": 0.2861, + "step": 396 + }, + { + "epoch": 2.4355828220858897, + "grad_norm": 3.184246778488159, + "learning_rate": 4.306511403669897e-06, + "loss": 0.2956, + "step": 397 + }, + { + "epoch": 2.441717791411043, + "grad_norm": 3.8991878032684326, + "learning_rate": 4.303177279737988e-06, + "loss": 0.5378, + "step": 398 + }, + { + "epoch": 2.4478527607361964, + "grad_norm": 3.411949872970581, + "learning_rate": 4.299836457533313e-06, + "loss": 0.3423, + "step": 399 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 3.445502996444702, + "learning_rate": 4.296488949466046e-06, + "loss": 0.5608, + "step": 400 + }, + { + "epoch": 2.460122699386503, + "grad_norm": 3.066798210144043, + "learning_rate": 4.293134767971193e-06, + "loss": 0.3214, + "step": 401 + }, + { + "epoch": 2.4662576687116564, + "grad_norm": 3.0581583976745605, + "learning_rate": 4.28977392550855e-06, + "loss": 0.5117, + "step": 402 + }, + { + "epoch": 2.4723926380368098, + "grad_norm": 4.207413673400879, + "learning_rate": 4.286406434562659e-06, + "loss": 0.2666, + "step": 403 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 2.9934990406036377, + "learning_rate": 4.283032307642756e-06, + "loss": 0.2878, + "step": 404 + }, + { + "epoch": 2.4846625766871164, + "grad_norm": 3.800593614578247, + "learning_rate": 4.2796515572827305e-06, + "loss": 0.2619, + "step": 405 + }, + { + "epoch": 2.4907975460122698, + "grad_norm": 3.2029523849487305, + "learning_rate": 4.276264196041074e-06, + "loss": 0.1735, + "step": 406 + }, + { + "epoch": 2.4969325153374236, + "grad_norm": 3.515634059906006, + "learning_rate": 4.2728702365008356e-06, + "loss": 0.4741, + "step": 407 + }, + { + "epoch": 2.5030674846625764, + "grad_norm": 3.8354873657226562, + "learning_rate": 4.269469691269577e-06, + "loss": 0.3713, + "step": 408 + }, + { + "epoch": 2.5092024539877302, + "grad_norm": 3.902904510498047, + "learning_rate": 4.266062572979323e-06, + "loss": 0.5189, + "step": 409 + }, + { + "epoch": 2.5153374233128836, + "grad_norm": 3.3276097774505615, + "learning_rate": 4.262648894286515e-06, + "loss": 0.2461, + "step": 410 + }, + { + "epoch": 2.521472392638037, + "grad_norm": 2.9457011222839355, + "learning_rate": 4.259228667871963e-06, + "loss": 0.3013, + "step": 411 + }, + { + "epoch": 2.5276073619631902, + "grad_norm": 2.8941617012023926, + "learning_rate": 4.255801906440803e-06, + "loss": 0.2784, + "step": 412 + }, + { + "epoch": 2.5337423312883436, + "grad_norm": 2.949399471282959, + "learning_rate": 4.252368622722443e-06, + "loss": 0.457, + "step": 413 + }, + { + "epoch": 2.539877300613497, + "grad_norm": 3.342108726501465, + "learning_rate": 4.248928829470522e-06, + "loss": 0.487, + "step": 414 + }, + { + "epoch": 2.5460122699386503, + "grad_norm": 3.9556386470794678, + "learning_rate": 4.245482539462861e-06, + "loss": 0.6118, + "step": 415 + }, + { + "epoch": 2.5521472392638036, + "grad_norm": 3.6936280727386475, + "learning_rate": 4.242029765501411e-06, + "loss": 0.6131, + "step": 416 + }, + { + "epoch": 2.558282208588957, + "grad_norm": 2.79897403717041, + "learning_rate": 4.2385705204122104e-06, + "loss": 0.4209, + "step": 417 + }, + { + "epoch": 2.5644171779141103, + "grad_norm": 4.093318462371826, + "learning_rate": 4.235104817045338e-06, + "loss": 0.5375, + "step": 418 + }, + { + "epoch": 2.5705521472392636, + "grad_norm": 3.138263463973999, + "learning_rate": 4.231632668274861e-06, + "loss": 0.4682, + "step": 419 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 3.1465651988983154, + "learning_rate": 4.22815408699879e-06, + "loss": 0.2522, + "step": 420 + }, + { + "epoch": 2.5828220858895703, + "grad_norm": 3.5166101455688477, + "learning_rate": 4.22466908613903e-06, + "loss": 0.4776, + "step": 421 + }, + { + "epoch": 2.588957055214724, + "grad_norm": 2.8498189449310303, + "learning_rate": 4.221177678641333e-06, + "loss": 0.3067, + "step": 422 + }, + { + "epoch": 2.5950920245398774, + "grad_norm": 2.8046035766601562, + "learning_rate": 4.217679877475251e-06, + "loss": 0.2402, + "step": 423 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 4.204788684844971, + "learning_rate": 4.214175695634084e-06, + "loss": 0.2608, + "step": 424 + }, + { + "epoch": 2.607361963190184, + "grad_norm": 2.5569400787353516, + "learning_rate": 4.210665146134838e-06, + "loss": 0.2801, + "step": 425 + }, + { + "epoch": 2.6134969325153374, + "grad_norm": 3.5359091758728027, + "learning_rate": 4.20714824201817e-06, + "loss": 0.2027, + "step": 426 + }, + { + "epoch": 2.6196319018404908, + "grad_norm": 3.5132668018341064, + "learning_rate": 4.203624996348343e-06, + "loss": 0.4253, + "step": 427 + }, + { + "epoch": 2.625766871165644, + "grad_norm": 3.5076472759246826, + "learning_rate": 4.200095422213177e-06, + "loss": 0.3014, + "step": 428 + }, + { + "epoch": 2.6319018404907975, + "grad_norm": 3.6501238346099854, + "learning_rate": 4.196559532724004e-06, + "loss": 0.6526, + "step": 429 + }, + { + "epoch": 2.638036809815951, + "grad_norm": 2.849924325942993, + "learning_rate": 4.193017341015608e-06, + "loss": 0.4487, + "step": 430 + }, + { + "epoch": 2.644171779141104, + "grad_norm": 3.2228448390960693, + "learning_rate": 4.189468860246192e-06, + "loss": 0.5386, + "step": 431 + }, + { + "epoch": 2.6503067484662575, + "grad_norm": 2.532102108001709, + "learning_rate": 4.185914103597316e-06, + "loss": 0.3034, + "step": 432 + }, + { + "epoch": 2.6564417177914113, + "grad_norm": 2.862720251083374, + "learning_rate": 4.182353084273855e-06, + "loss": 0.5862, + "step": 433 + }, + { + "epoch": 2.662576687116564, + "grad_norm": 3.4617464542388916, + "learning_rate": 4.178785815503946e-06, + "loss": 0.3954, + "step": 434 + }, + { + "epoch": 2.668711656441718, + "grad_norm": 2.627758741378784, + "learning_rate": 4.1752123105389444e-06, + "loss": 0.4367, + "step": 435 + }, + { + "epoch": 2.6748466257668713, + "grad_norm": 3.2868380546569824, + "learning_rate": 4.171632582653368e-06, + "loss": 0.2997, + "step": 436 + }, + { + "epoch": 2.6809815950920246, + "grad_norm": 3.4260897636413574, + "learning_rate": 4.168046645144851e-06, + "loss": 0.3354, + "step": 437 + }, + { + "epoch": 2.687116564417178, + "grad_norm": 3.1415748596191406, + "learning_rate": 4.164454511334098e-06, + "loss": 0.5538, + "step": 438 + }, + { + "epoch": 2.6932515337423313, + "grad_norm": 3.3700919151306152, + "learning_rate": 4.160856194564828e-06, + "loss": 0.5731, + "step": 439 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 3.146968364715576, + "learning_rate": 4.157251708203728e-06, + "loss": 0.4429, + "step": 440 + }, + { + "epoch": 2.705521472392638, + "grad_norm": 3.7495830059051514, + "learning_rate": 4.153641065640402e-06, + "loss": 0.6361, + "step": 441 + }, + { + "epoch": 2.7116564417177913, + "grad_norm": 3.426499128341675, + "learning_rate": 4.150024280287327e-06, + "loss": 0.2418, + "step": 442 + }, + { + "epoch": 2.7177914110429446, + "grad_norm": 3.213719606399536, + "learning_rate": 4.146401365579795e-06, + "loss": 0.2549, + "step": 443 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 3.457742929458618, + "learning_rate": 4.142772334975868e-06, + "loss": 0.3822, + "step": 444 + }, + { + "epoch": 2.7300613496932513, + "grad_norm": 3.130410671234131, + "learning_rate": 4.139137201956324e-06, + "loss": 0.3107, + "step": 445 + }, + { + "epoch": 2.736196319018405, + "grad_norm": 2.7337112426757812, + "learning_rate": 4.1354959800246155e-06, + "loss": 0.2829, + "step": 446 + }, + { + "epoch": 2.7423312883435584, + "grad_norm": 3.427006483078003, + "learning_rate": 4.131848682706807e-06, + "loss": 0.3045, + "step": 447 + }, + { + "epoch": 2.7484662576687118, + "grad_norm": 3.3742318153381348, + "learning_rate": 4.128195323551536e-06, + "loss": 0.316, + "step": 448 + }, + { + "epoch": 2.754601226993865, + "grad_norm": 3.086738109588623, + "learning_rate": 4.1245359161299555e-06, + "loss": 0.5278, + "step": 449 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 3.4609954357147217, + "learning_rate": 4.120870474035687e-06, + "loss": 0.447, + "step": 450 + }, + { + "epoch": 2.766871165644172, + "grad_norm": 3.552663803100586, + "learning_rate": 4.1171990108847705e-06, + "loss": 0.6127, + "step": 451 + }, + { + "epoch": 2.773006134969325, + "grad_norm": 4.413427352905273, + "learning_rate": 4.113521540315609e-06, + "loss": 0.3304, + "step": 452 + }, + { + "epoch": 2.7791411042944785, + "grad_norm": 3.3408143520355225, + "learning_rate": 4.109838075988922e-06, + "loss": 0.5871, + "step": 453 + }, + { + "epoch": 2.785276073619632, + "grad_norm": 3.0659773349761963, + "learning_rate": 4.106148631587697e-06, + "loss": 0.3578, + "step": 454 + }, + { + "epoch": 2.791411042944785, + "grad_norm": 3.2854816913604736, + "learning_rate": 4.102453220817134e-06, + "loss": 0.4685, + "step": 455 + }, + { + "epoch": 2.7975460122699385, + "grad_norm": 3.4940855503082275, + "learning_rate": 4.098751857404595e-06, + "loss": 0.2818, + "step": 456 + }, + { + "epoch": 2.8036809815950923, + "grad_norm": 2.4630730152130127, + "learning_rate": 4.0950445550995566e-06, + "loss": 0.3497, + "step": 457 + }, + { + "epoch": 2.809815950920245, + "grad_norm": 3.3870959281921387, + "learning_rate": 4.091331327673554e-06, + "loss": 0.4954, + "step": 458 + }, + { + "epoch": 2.815950920245399, + "grad_norm": 2.3676836490631104, + "learning_rate": 4.087612188920135e-06, + "loss": 0.3884, + "step": 459 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 3.2477807998657227, + "learning_rate": 4.083887152654804e-06, + "loss": 0.375, + "step": 460 + }, + { + "epoch": 2.8282208588957056, + "grad_norm": 3.295673131942749, + "learning_rate": 4.080156232714976e-06, + "loss": 0.3272, + "step": 461 + }, + { + "epoch": 2.834355828220859, + "grad_norm": 2.800847291946411, + "learning_rate": 4.07641944295992e-06, + "loss": 0.2936, + "step": 462 + }, + { + "epoch": 2.8404907975460123, + "grad_norm": 3.443336009979248, + "learning_rate": 4.072676797270708e-06, + "loss": 0.2363, + "step": 463 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 3.1334242820739746, + "learning_rate": 4.0689283095501684e-06, + "loss": 0.4827, + "step": 464 + }, + { + "epoch": 2.852760736196319, + "grad_norm": 3.950672149658203, + "learning_rate": 4.06517399372283e-06, + "loss": 0.3163, + "step": 465 + }, + { + "epoch": 2.8588957055214723, + "grad_norm": 4.243579387664795, + "learning_rate": 4.061413863734869e-06, + "loss": 0.2827, + "step": 466 + }, + { + "epoch": 2.8650306748466257, + "grad_norm": 4.076017379760742, + "learning_rate": 4.057647933554063e-06, + "loss": 0.3466, + "step": 467 + }, + { + "epoch": 2.871165644171779, + "grad_norm": 2.846989631652832, + "learning_rate": 4.053876217169734e-06, + "loss": 0.4632, + "step": 468 + }, + { + "epoch": 2.8773006134969323, + "grad_norm": 2.74981689453125, + "learning_rate": 4.050098728592698e-06, + "loss": 0.2001, + "step": 469 + }, + { + "epoch": 2.883435582822086, + "grad_norm": 3.062068462371826, + "learning_rate": 4.046315481855211e-06, + "loss": 0.5425, + "step": 470 + }, + { + "epoch": 2.889570552147239, + "grad_norm": 2.8630964756011963, + "learning_rate": 4.0425264910109245e-06, + "loss": 0.424, + "step": 471 + }, + { + "epoch": 2.895705521472393, + "grad_norm": 3.537442922592163, + "learning_rate": 4.03873177013482e-06, + "loss": 0.2443, + "step": 472 + }, + { + "epoch": 2.901840490797546, + "grad_norm": 3.128535270690918, + "learning_rate": 4.034931333323173e-06, + "loss": 0.3734, + "step": 473 + }, + { + "epoch": 2.9079754601226995, + "grad_norm": 3.021897792816162, + "learning_rate": 4.031125194693484e-06, + "loss": 0.3762, + "step": 474 + }, + { + "epoch": 2.914110429447853, + "grad_norm": 3.0943546295166016, + "learning_rate": 4.0273133683844375e-06, + "loss": 0.3721, + "step": 475 + }, + { + "epoch": 2.920245398773006, + "grad_norm": 3.443448305130005, + "learning_rate": 4.023495868555848e-06, + "loss": 0.2868, + "step": 476 + }, + { + "epoch": 2.9263803680981595, + "grad_norm": 2.865227222442627, + "learning_rate": 4.0196727093886024e-06, + "loss": 0.5086, + "step": 477 + }, + { + "epoch": 2.932515337423313, + "grad_norm": 3.1272058486938477, + "learning_rate": 4.015843905084612e-06, + "loss": 0.4616, + "step": 478 + }, + { + "epoch": 2.938650306748466, + "grad_norm": 3.0584447383880615, + "learning_rate": 4.012009469866756e-06, + "loss": 0.403, + "step": 479 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 4.42616081237793, + "learning_rate": 4.008169417978836e-06, + "loss": 0.5801, + "step": 480 + }, + { + "epoch": 2.950920245398773, + "grad_norm": 2.8444535732269287, + "learning_rate": 4.004323763685511e-06, + "loss": 0.5808, + "step": 481 + }, + { + "epoch": 2.957055214723926, + "grad_norm": 2.591719627380371, + "learning_rate": 4.0004725212722565e-06, + "loss": 0.2584, + "step": 482 + }, + { + "epoch": 2.96319018404908, + "grad_norm": 2.5496113300323486, + "learning_rate": 3.996615705045302e-06, + "loss": 0.462, + "step": 483 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 2.9932925701141357, + "learning_rate": 3.992753329331588e-06, + "loss": 0.3502, + "step": 484 + }, + { + "epoch": 2.9754601226993866, + "grad_norm": 3.136871337890625, + "learning_rate": 3.9888854084786995e-06, + "loss": 0.5989, + "step": 485 + }, + { + "epoch": 2.98159509202454, + "grad_norm": 3.6654274463653564, + "learning_rate": 3.985011956854826e-06, + "loss": 0.6772, + "step": 486 + }, + { + "epoch": 2.9877300613496933, + "grad_norm": 2.5398948192596436, + "learning_rate": 3.9811329888487004e-06, + "loss": 0.4192, + "step": 487 + }, + { + "epoch": 2.9938650306748467, + "grad_norm": 4.89943790435791, + "learning_rate": 3.977248518869545e-06, + "loss": 0.4031, + "step": 488 + }, + { + "epoch": 3.0, + "grad_norm": 3.4729995727539062, + "learning_rate": 3.973358561347024e-06, + "loss": 0.7764, + "step": 489 + }, + { + "epoch": 3.0061349693251533, + "grad_norm": 5.331607818603516, + "learning_rate": 3.969463130731183e-06, + "loss": 0.3267, + "step": 490 + }, + { + "epoch": 3.0122699386503067, + "grad_norm": 3.453650712966919, + "learning_rate": 3.965562241492401e-06, + "loss": 0.2719, + "step": 491 + }, + { + "epoch": 3.01840490797546, + "grad_norm": 3.232313632965088, + "learning_rate": 3.9616559081213335e-06, + "loss": 0.1825, + "step": 492 + }, + { + "epoch": 3.0245398773006134, + "grad_norm": 3.4860260486602783, + "learning_rate": 3.957744145128858e-06, + "loss": 0.1854, + "step": 493 + }, + { + "epoch": 3.0306748466257667, + "grad_norm": 3.4357805252075195, + "learning_rate": 3.953826967046021e-06, + "loss": 0.2224, + "step": 494 + }, + { + "epoch": 3.03680981595092, + "grad_norm": 4.557503700256348, + "learning_rate": 3.9499043884239894e-06, + "loss": 0.349, + "step": 495 + }, + { + "epoch": 3.042944785276074, + "grad_norm": 4.685214042663574, + "learning_rate": 3.945976423833987e-06, + "loss": 0.175, + "step": 496 + }, + { + "epoch": 3.049079754601227, + "grad_norm": 3.7430171966552734, + "learning_rate": 3.942043087867244e-06, + "loss": 0.2773, + "step": 497 + }, + { + "epoch": 3.0552147239263805, + "grad_norm": 3.756450653076172, + "learning_rate": 3.938104395134947e-06, + "loss": 0.4445, + "step": 498 + }, + { + "epoch": 3.061349693251534, + "grad_norm": 4.049175262451172, + "learning_rate": 3.9341603602681805e-06, + "loss": 0.3046, + "step": 499 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 3.7689461708068848, + "learning_rate": 3.930210997917871e-06, + "loss": 0.2544, + "step": 500 + }, + { + "epoch": 3.0736196319018405, + "grad_norm": 4.027602195739746, + "learning_rate": 3.92625632275474e-06, + "loss": 0.3154, + "step": 501 + }, + { + "epoch": 3.079754601226994, + "grad_norm": 2.8449292182922363, + "learning_rate": 3.922296349469239e-06, + "loss": 0.2804, + "step": 502 + }, + { + "epoch": 3.085889570552147, + "grad_norm": 2.9555234909057617, + "learning_rate": 3.918331092771505e-06, + "loss": 0.2393, + "step": 503 + }, + { + "epoch": 3.0920245398773005, + "grad_norm": 2.621042013168335, + "learning_rate": 3.914360567391296e-06, + "loss": 0.1403, + "step": 504 + }, + { + "epoch": 3.098159509202454, + "grad_norm": 3.2348620891571045, + "learning_rate": 3.910384788077949e-06, + "loss": 0.1537, + "step": 505 + }, + { + "epoch": 3.104294478527607, + "grad_norm": 3.030179977416992, + "learning_rate": 3.906403769600311e-06, + "loss": 0.2921, + "step": 506 + }, + { + "epoch": 3.1104294478527605, + "grad_norm": 3.146428346633911, + "learning_rate": 3.902417526746694e-06, + "loss": 0.2036, + "step": 507 + }, + { + "epoch": 3.116564417177914, + "grad_norm": 3.6201512813568115, + "learning_rate": 3.898426074324818e-06, + "loss": 0.2655, + "step": 508 + }, + { + "epoch": 3.1226993865030677, + "grad_norm": 3.7674012184143066, + "learning_rate": 3.8944294271617524e-06, + "loss": 0.3938, + "step": 509 + }, + { + "epoch": 3.128834355828221, + "grad_norm": 4.54722785949707, + "learning_rate": 3.890427600103865e-06, + "loss": 0.3051, + "step": 510 + }, + { + "epoch": 3.1349693251533743, + "grad_norm": 4.228236675262451, + "learning_rate": 3.886420608016767e-06, + "loss": 0.3719, + "step": 511 + }, + { + "epoch": 3.1411042944785277, + "grad_norm": 4.355110168457031, + "learning_rate": 3.882408465785252e-06, + "loss": 0.1863, + "step": 512 + }, + { + "epoch": 3.147239263803681, + "grad_norm": 3.451460838317871, + "learning_rate": 3.878391188313249e-06, + "loss": 0.1479, + "step": 513 + }, + { + "epoch": 3.1533742331288344, + "grad_norm": 4.395524501800537, + "learning_rate": 3.87436879052376e-06, + "loss": 0.238, + "step": 514 + }, + { + "epoch": 3.1595092024539877, + "grad_norm": 2.940717935562134, + "learning_rate": 3.870341287358809e-06, + "loss": 0.2069, + "step": 515 + }, + { + "epoch": 3.165644171779141, + "grad_norm": 2.5817320346832275, + "learning_rate": 3.8663086937793845e-06, + "loss": 0.1189, + "step": 516 + }, + { + "epoch": 3.1717791411042944, + "grad_norm": 3.9863343238830566, + "learning_rate": 3.862271024765385e-06, + "loss": 0.3434, + "step": 517 + }, + { + "epoch": 3.1779141104294477, + "grad_norm": 3.609004259109497, + "learning_rate": 3.8582282953155626e-06, + "loss": 0.1602, + "step": 518 + }, + { + "epoch": 3.184049079754601, + "grad_norm": 3.207533121109009, + "learning_rate": 3.854180520447465e-06, + "loss": 0.3452, + "step": 519 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 3.593388795852661, + "learning_rate": 3.850127715197387e-06, + "loss": 0.2832, + "step": 520 + }, + { + "epoch": 3.196319018404908, + "grad_norm": 3.409064531326294, + "learning_rate": 3.846069894620306e-06, + "loss": 0.1481, + "step": 521 + }, + { + "epoch": 3.2024539877300615, + "grad_norm": 3.461498737335205, + "learning_rate": 3.84200707378983e-06, + "loss": 0.1283, + "step": 522 + }, + { + "epoch": 3.208588957055215, + "grad_norm": 3.708467483520508, + "learning_rate": 3.8379392677981434e-06, + "loss": 0.2468, + "step": 523 + }, + { + "epoch": 3.214723926380368, + "grad_norm": 2.802381753921509, + "learning_rate": 3.833866491755947e-06, + "loss": 0.2685, + "step": 524 + }, + { + "epoch": 3.2208588957055215, + "grad_norm": 3.0787744522094727, + "learning_rate": 3.8297887607924044e-06, + "loss": 0.2595, + "step": 525 + }, + { + "epoch": 3.226993865030675, + "grad_norm": 3.3952548503875732, + "learning_rate": 3.825706090055088e-06, + "loss": 0.4099, + "step": 526 + }, + { + "epoch": 3.233128834355828, + "grad_norm": 3.3497085571289062, + "learning_rate": 3.821618494709916e-06, + "loss": 0.287, + "step": 527 + }, + { + "epoch": 3.2392638036809815, + "grad_norm": 4.050611972808838, + "learning_rate": 3.817525989941102e-06, + "loss": 0.2369, + "step": 528 + }, + { + "epoch": 3.245398773006135, + "grad_norm": 2.87642240524292, + "learning_rate": 3.8134285909510972e-06, + "loss": 0.2751, + "step": 529 + }, + { + "epoch": 3.2515337423312882, + "grad_norm": 3.821941614151001, + "learning_rate": 3.8093263129605305e-06, + "loss": 0.2363, + "step": 530 + }, + { + "epoch": 3.2576687116564416, + "grad_norm": 2.8066117763519287, + "learning_rate": 3.80521917120816e-06, + "loss": 0.094, + "step": 531 + }, + { + "epoch": 3.263803680981595, + "grad_norm": 3.849768877029419, + "learning_rate": 3.801107180950806e-06, + "loss": 0.4117, + "step": 532 + }, + { + "epoch": 3.2699386503067487, + "grad_norm": 2.4161250591278076, + "learning_rate": 3.7969903574633028e-06, + "loss": 0.1183, + "step": 533 + }, + { + "epoch": 3.276073619631902, + "grad_norm": 3.6743111610412598, + "learning_rate": 3.792868716038437e-06, + "loss": 0.2296, + "step": 534 + }, + { + "epoch": 3.2822085889570554, + "grad_norm": 4.378123760223389, + "learning_rate": 3.7887422719868937e-06, + "loss": 0.2678, + "step": 535 + }, + { + "epoch": 3.2883435582822087, + "grad_norm": 4.816481590270996, + "learning_rate": 3.784611040637198e-06, + "loss": 0.4887, + "step": 536 + }, + { + "epoch": 3.294478527607362, + "grad_norm": 3.5712430477142334, + "learning_rate": 3.7804750373356576e-06, + "loss": 0.3827, + "step": 537 + }, + { + "epoch": 3.3006134969325154, + "grad_norm": 3.6877355575561523, + "learning_rate": 3.776334277446307e-06, + "loss": 0.3233, + "step": 538 + }, + { + "epoch": 3.3067484662576687, + "grad_norm": 3.442706346511841, + "learning_rate": 3.7721887763508512e-06, + "loss": 0.1256, + "step": 539 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 3.9265615940093994, + "learning_rate": 3.7680385494486053e-06, + "loss": 0.3845, + "step": 540 + }, + { + "epoch": 3.3190184049079754, + "grad_norm": 3.5030126571655273, + "learning_rate": 3.7638836121564414e-06, + "loss": 0.2905, + "step": 541 + }, + { + "epoch": 3.3251533742331287, + "grad_norm": 3.6685378551483154, + "learning_rate": 3.7597239799087283e-06, + "loss": 0.3561, + "step": 542 + }, + { + "epoch": 3.331288343558282, + "grad_norm": 3.8484046459198, + "learning_rate": 3.7555596681572736e-06, + "loss": 0.1157, + "step": 543 + }, + { + "epoch": 3.3374233128834354, + "grad_norm": 3.7977402210235596, + "learning_rate": 3.751390692371272e-06, + "loss": 0.3049, + "step": 544 + }, + { + "epoch": 3.3435582822085887, + "grad_norm": 3.4409852027893066, + "learning_rate": 3.7472170680372398e-06, + "loss": 0.1626, + "step": 545 + }, + { + "epoch": 3.3496932515337425, + "grad_norm": 3.801541328430176, + "learning_rate": 3.7430388106589632e-06, + "loss": 0.2414, + "step": 546 + }, + { + "epoch": 3.355828220858896, + "grad_norm": 4.025203704833984, + "learning_rate": 3.738855935757438e-06, + "loss": 0.3441, + "step": 547 + }, + { + "epoch": 3.361963190184049, + "grad_norm": 4.242798805236816, + "learning_rate": 3.7346684588708135e-06, + "loss": 0.5244, + "step": 548 + }, + { + "epoch": 3.3680981595092025, + "grad_norm": 3.0516819953918457, + "learning_rate": 3.7304763955543332e-06, + "loss": 0.1984, + "step": 549 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 3.894667625427246, + "learning_rate": 3.726279761380279e-06, + "loss": 0.2715, + "step": 550 + }, + { + "epoch": 3.3803680981595092, + "grad_norm": 3.171208143234253, + "learning_rate": 3.72207857193791e-06, + "loss": 0.1537, + "step": 551 + }, + { + "epoch": 3.3865030674846626, + "grad_norm": 4.344860553741455, + "learning_rate": 3.7178728428334092e-06, + "loss": 0.2388, + "step": 552 + }, + { + "epoch": 3.392638036809816, + "grad_norm": 2.766317367553711, + "learning_rate": 3.7136625896898226e-06, + "loss": 0.1726, + "step": 553 + }, + { + "epoch": 3.3987730061349692, + "grad_norm": 3.550662040710449, + "learning_rate": 3.7094478281470003e-06, + "loss": 0.2942, + "step": 554 + }, + { + "epoch": 3.4049079754601226, + "grad_norm": 3.4576945304870605, + "learning_rate": 3.7052285738615412e-06, + "loss": 0.1665, + "step": 555 + }, + { + "epoch": 3.411042944785276, + "grad_norm": 4.026793003082275, + "learning_rate": 3.7010048425067317e-06, + "loss": 0.3954, + "step": 556 + }, + { + "epoch": 3.4171779141104293, + "grad_norm": 4.600133419036865, + "learning_rate": 3.696776649772492e-06, + "loss": 0.3207, + "step": 557 + }, + { + "epoch": 3.4233128834355826, + "grad_norm": 4.747331142425537, + "learning_rate": 3.692544011365312e-06, + "loss": 0.1325, + "step": 558 + }, + { + "epoch": 3.4294478527607364, + "grad_norm": 3.781464099884033, + "learning_rate": 3.6883069430081986e-06, + "loss": 0.1644, + "step": 559 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 2.905986785888672, + "learning_rate": 3.6840654604406135e-06, + "loss": 0.2469, + "step": 560 + }, + { + "epoch": 3.441717791411043, + "grad_norm": 2.3747711181640625, + "learning_rate": 3.679819579418414e-06, + "loss": 0.1146, + "step": 561 + }, + { + "epoch": 3.4478527607361964, + "grad_norm": 3.2683632373809814, + "learning_rate": 3.6755693157137995e-06, + "loss": 0.3236, + "step": 562 + }, + { + "epoch": 3.4539877300613497, + "grad_norm": 3.7750496864318848, + "learning_rate": 3.6713146851152487e-06, + "loss": 0.399, + "step": 563 + }, + { + "epoch": 3.460122699386503, + "grad_norm": 3.3912384510040283, + "learning_rate": 3.667055703427461e-06, + "loss": 0.1259, + "step": 564 + }, + { + "epoch": 3.4662576687116564, + "grad_norm": 3.0224430561065674, + "learning_rate": 3.6627923864713e-06, + "loss": 0.1835, + "step": 565 + }, + { + "epoch": 3.4723926380368098, + "grad_norm": 3.642258405685425, + "learning_rate": 3.658524750083733e-06, + "loss": 0.2763, + "step": 566 + }, + { + "epoch": 3.478527607361963, + "grad_norm": 3.409890651702881, + "learning_rate": 3.654252810117773e-06, + "loss": 0.2496, + "step": 567 + }, + { + "epoch": 3.4846625766871164, + "grad_norm": 3.0416476726531982, + "learning_rate": 3.6499765824424195e-06, + "loss": 0.1287, + "step": 568 + }, + { + "epoch": 3.4907975460122698, + "grad_norm": 3.1963987350463867, + "learning_rate": 3.6456960829425987e-06, + "loss": 0.1747, + "step": 569 + }, + { + "epoch": 3.4969325153374236, + "grad_norm": 3.198448657989502, + "learning_rate": 3.641411327519107e-06, + "loss": 0.1913, + "step": 570 + }, + { + "epoch": 3.5030674846625764, + "grad_norm": 3.7023441791534424, + "learning_rate": 3.6371223320885492e-06, + "loss": 0.3224, + "step": 571 + }, + { + "epoch": 3.5092024539877302, + "grad_norm": 4.54288387298584, + "learning_rate": 3.6328291125832803e-06, + "loss": 0.2364, + "step": 572 + }, + { + "epoch": 3.5153374233128836, + "grad_norm": 3.5064890384674072, + "learning_rate": 3.628531684951347e-06, + "loss": 0.2552, + "step": 573 + }, + { + "epoch": 3.521472392638037, + "grad_norm": 3.987583875656128, + "learning_rate": 3.6242300651564276e-06, + "loss": 0.3232, + "step": 574 + }, + { + "epoch": 3.5276073619631902, + "grad_norm": 3.179642915725708, + "learning_rate": 3.6199242691777745e-06, + "loss": 0.32, + "step": 575 + }, + { + "epoch": 3.5337423312883436, + "grad_norm": 3.3078157901763916, + "learning_rate": 3.6156143130101516e-06, + "loss": 0.2922, + "step": 576 + }, + { + "epoch": 3.539877300613497, + "grad_norm": 3.1628613471984863, + "learning_rate": 3.6113002126637765e-06, + "loss": 0.2005, + "step": 577 + }, + { + "epoch": 3.5460122699386503, + "grad_norm": 3.4515540599823, + "learning_rate": 3.606981984164263e-06, + "loss": 0.2138, + "step": 578 + }, + { + "epoch": 3.5521472392638036, + "grad_norm": 5.132473945617676, + "learning_rate": 3.6026596435525578e-06, + "loss": 0.4382, + "step": 579 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 3.397614002227783, + "learning_rate": 3.5983332068848855e-06, + "loss": 0.3326, + "step": 580 + }, + { + "epoch": 3.5644171779141103, + "grad_norm": 4.79497766494751, + "learning_rate": 3.5940026902326825e-06, + "loss": 0.4748, + "step": 581 + }, + { + "epoch": 3.5705521472392636, + "grad_norm": 3.7675018310546875, + "learning_rate": 3.5896681096825446e-06, + "loss": 0.2692, + "step": 582 + }, + { + "epoch": 3.5766871165644174, + "grad_norm": 3.0637521743774414, + "learning_rate": 3.5853294813361614e-06, + "loss": 0.3658, + "step": 583 + }, + { + "epoch": 3.5828220858895703, + "grad_norm": 2.8949790000915527, + "learning_rate": 3.5809868213102623e-06, + "loss": 0.1661, + "step": 584 + }, + { + "epoch": 3.588957055214724, + "grad_norm": 3.163419246673584, + "learning_rate": 3.5766401457365485e-06, + "loss": 0.1233, + "step": 585 + }, + { + "epoch": 3.5950920245398774, + "grad_norm": 3.1787965297698975, + "learning_rate": 3.5722894707616417e-06, + "loss": 0.278, + "step": 586 + }, + { + "epoch": 3.6012269938650308, + "grad_norm": 2.9397857189178467, + "learning_rate": 3.5679348125470175e-06, + "loss": 0.1541, + "step": 587 + }, + { + "epoch": 3.607361963190184, + "grad_norm": 3.2690396308898926, + "learning_rate": 3.56357618726895e-06, + "loss": 0.1575, + "step": 588 + }, + { + "epoch": 3.6134969325153374, + "grad_norm": 5.444014072418213, + "learning_rate": 3.5592136111184483e-06, + "loss": 0.8079, + "step": 589 + }, + { + "epoch": 3.6196319018404908, + "grad_norm": 3.1688313484191895, + "learning_rate": 3.554847100301199e-06, + "loss": 0.341, + "step": 590 + }, + { + "epoch": 3.625766871165644, + "grad_norm": 2.469212532043457, + "learning_rate": 3.550476671037505e-06, + "loss": 0.1625, + "step": 591 + }, + { + "epoch": 3.6319018404907975, + "grad_norm": 3.3956527709960938, + "learning_rate": 3.546102339562223e-06, + "loss": 0.199, + "step": 592 + }, + { + "epoch": 3.638036809815951, + "grad_norm": 2.7287702560424805, + "learning_rate": 3.5417241221247078e-06, + "loss": 0.1493, + "step": 593 + }, + { + "epoch": 3.644171779141104, + "grad_norm": 3.5046865940093994, + "learning_rate": 3.5373420349887477e-06, + "loss": 0.2765, + "step": 594 + }, + { + "epoch": 3.6503067484662575, + "grad_norm": 3.121476650238037, + "learning_rate": 3.5329560944325065e-06, + "loss": 0.2833, + "step": 595 + }, + { + "epoch": 3.6564417177914113, + "grad_norm": 3.276463270187378, + "learning_rate": 3.528566316748462e-06, + "loss": 0.1237, + "step": 596 + }, + { + "epoch": 3.662576687116564, + "grad_norm": 3.382840633392334, + "learning_rate": 3.524172718243347e-06, + "loss": 0.1599, + "step": 597 + }, + { + "epoch": 3.668711656441718, + "grad_norm": 4.801311492919922, + "learning_rate": 3.5197753152380854e-06, + "loss": 0.2997, + "step": 598 + }, + { + "epoch": 3.6748466257668713, + "grad_norm": 4.117336273193359, + "learning_rate": 3.515374124067736e-06, + "loss": 0.2021, + "step": 599 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 3.611438035964966, + "learning_rate": 3.5109691610814263e-06, + "loss": 0.1726, + "step": 600 + }, + { + "epoch": 3.687116564417178, + "grad_norm": 4.5179972648620605, + "learning_rate": 3.5065604426422995e-06, + "loss": 0.1377, + "step": 601 + }, + { + "epoch": 3.6932515337423313, + "grad_norm": 3.561061382293701, + "learning_rate": 3.502147985127445e-06, + "loss": 0.1497, + "step": 602 + }, + { + "epoch": 3.6993865030674846, + "grad_norm": 3.3497917652130127, + "learning_rate": 3.4977318049278443e-06, + "loss": 0.1589, + "step": 603 + }, + { + "epoch": 3.705521472392638, + "grad_norm": 3.2725470066070557, + "learning_rate": 3.4933119184483065e-06, + "loss": 0.1364, + "step": 604 + }, + { + "epoch": 3.7116564417177913, + "grad_norm": 3.228956460952759, + "learning_rate": 3.4888883421074076e-06, + "loss": 0.177, + "step": 605 + }, + { + "epoch": 3.7177914110429446, + "grad_norm": 3.7648911476135254, + "learning_rate": 3.484461092337434e-06, + "loss": 0.122, + "step": 606 + }, + { + "epoch": 3.7239263803680984, + "grad_norm": 3.5322585105895996, + "learning_rate": 3.4800301855843137e-06, + "loss": 0.2664, + "step": 607 + }, + { + "epoch": 3.7300613496932513, + "grad_norm": 2.951073169708252, + "learning_rate": 3.4755956383075613e-06, + "loss": 0.12, + "step": 608 + }, + { + "epoch": 3.736196319018405, + "grad_norm": 3.0577664375305176, + "learning_rate": 3.471157466980214e-06, + "loss": 0.3926, + "step": 609 + }, + { + "epoch": 3.7423312883435584, + "grad_norm": 4.089846134185791, + "learning_rate": 3.466715688088772e-06, + "loss": 0.6233, + "step": 610 + }, + { + "epoch": 3.7484662576687118, + "grad_norm": 3.081340789794922, + "learning_rate": 3.462270318133136e-06, + "loss": 0.2456, + "step": 611 + }, + { + "epoch": 3.754601226993865, + "grad_norm": 3.034712553024292, + "learning_rate": 3.4578213736265474e-06, + "loss": 0.2683, + "step": 612 + }, + { + "epoch": 3.7607361963190185, + "grad_norm": 3.459815740585327, + "learning_rate": 3.4533688710955255e-06, + "loss": 0.3796, + "step": 613 + }, + { + "epoch": 3.766871165644172, + "grad_norm": 3.523737907409668, + "learning_rate": 3.448912827079805e-06, + "loss": 0.3326, + "step": 614 + }, + { + "epoch": 3.773006134969325, + "grad_norm": 3.333219289779663, + "learning_rate": 3.4444532581322793e-06, + "loss": 0.206, + "step": 615 + }, + { + "epoch": 3.7791411042944785, + "grad_norm": 3.582387685775757, + "learning_rate": 3.4399901808189327e-06, + "loss": 0.244, + "step": 616 + }, + { + "epoch": 3.785276073619632, + "grad_norm": 3.4887266159057617, + "learning_rate": 3.435523611718785e-06, + "loss": 0.1796, + "step": 617 + }, + { + "epoch": 3.791411042944785, + "grad_norm": 4.89408016204834, + "learning_rate": 3.4310535674238242e-06, + "loss": 0.188, + "step": 618 + }, + { + "epoch": 3.7975460122699385, + "grad_norm": 4.338910102844238, + "learning_rate": 3.42658006453895e-06, + "loss": 0.3039, + "step": 619 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 4.107708930969238, + "learning_rate": 3.4221031196819083e-06, + "loss": 0.3383, + "step": 620 + }, + { + "epoch": 3.809815950920245, + "grad_norm": 3.698777675628662, + "learning_rate": 3.4176227494832305e-06, + "loss": 0.1721, + "step": 621 + }, + { + "epoch": 3.815950920245399, + "grad_norm": 2.6659226417541504, + "learning_rate": 3.413138970586174e-06, + "loss": 0.2211, + "step": 622 + }, + { + "epoch": 3.8220858895705523, + "grad_norm": 3.2398436069488525, + "learning_rate": 3.4086517996466574e-06, + "loss": 0.1871, + "step": 623 + }, + { + "epoch": 3.8282208588957056, + "grad_norm": 4.9128804206848145, + "learning_rate": 3.404161253333199e-06, + "loss": 0.3874, + "step": 624 + }, + { + "epoch": 3.834355828220859, + "grad_norm": 3.508789300918579, + "learning_rate": 3.3996673483268573e-06, + "loss": 0.1739, + "step": 625 + }, + { + "epoch": 3.8404907975460123, + "grad_norm": 3.3016927242279053, + "learning_rate": 3.3951701013211665e-06, + "loss": 0.274, + "step": 626 + }, + { + "epoch": 3.8466257668711656, + "grad_norm": 3.8941333293914795, + "learning_rate": 3.3906695290220736e-06, + "loss": 0.3568, + "step": 627 + }, + { + "epoch": 3.852760736196319, + "grad_norm": 3.512354850769043, + "learning_rate": 3.3861656481478816e-06, + "loss": 0.157, + "step": 628 + }, + { + "epoch": 3.8588957055214723, + "grad_norm": 3.482649326324463, + "learning_rate": 3.3816584754291814e-06, + "loss": 0.1218, + "step": 629 + }, + { + "epoch": 3.8650306748466257, + "grad_norm": 3.1490275859832764, + "learning_rate": 3.377148027608793e-06, + "loss": 0.2234, + "step": 630 + }, + { + "epoch": 3.871165644171779, + "grad_norm": 3.2172653675079346, + "learning_rate": 3.3726343214417023e-06, + "loss": 0.3329, + "step": 631 + }, + { + "epoch": 3.8773006134969323, + "grad_norm": 4.167707443237305, + "learning_rate": 3.3681173736949984e-06, + "loss": 0.1384, + "step": 632 + }, + { + "epoch": 3.883435582822086, + "grad_norm": 3.4743919372558594, + "learning_rate": 3.3635972011478134e-06, + "loss": 0.3807, + "step": 633 + }, + { + "epoch": 3.889570552147239, + "grad_norm": 3.6892173290252686, + "learning_rate": 3.3590738205912566e-06, + "loss": 0.194, + "step": 634 + }, + { + "epoch": 3.895705521472393, + "grad_norm": 3.262967824935913, + "learning_rate": 3.354547248828356e-06, + "loss": 0.202, + "step": 635 + }, + { + "epoch": 3.901840490797546, + "grad_norm": 3.8871562480926514, + "learning_rate": 3.3500175026739916e-06, + "loss": 0.2471, + "step": 636 + }, + { + "epoch": 3.9079754601226995, + "grad_norm": 3.5097084045410156, + "learning_rate": 3.3454845989548385e-06, + "loss": 0.1112, + "step": 637 + }, + { + "epoch": 3.914110429447853, + "grad_norm": 4.163944721221924, + "learning_rate": 3.3409485545092995e-06, + "loss": 0.3368, + "step": 638 + }, + { + "epoch": 3.920245398773006, + "grad_norm": 3.6405045986175537, + "learning_rate": 3.336409386187444e-06, + "loss": 0.1863, + "step": 639 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 3.2477526664733887, + "learning_rate": 3.331867110850946e-06, + "loss": 0.1491, + "step": 640 + }, + { + "epoch": 3.932515337423313, + "grad_norm": 3.933753490447998, + "learning_rate": 3.327321745373021e-06, + "loss": 0.2484, + "step": 641 + }, + { + "epoch": 3.938650306748466, + "grad_norm": 3.2475059032440186, + "learning_rate": 3.322773306638364e-06, + "loss": 0.2126, + "step": 642 + }, + { + "epoch": 3.9447852760736195, + "grad_norm": 2.628467321395874, + "learning_rate": 3.318221811543086e-06, + "loss": 0.1649, + "step": 643 + }, + { + "epoch": 3.950920245398773, + "grad_norm": 3.2612411975860596, + "learning_rate": 3.313667276994651e-06, + "loss": 0.1442, + "step": 644 + }, + { + "epoch": 3.957055214723926, + "grad_norm": 3.8058395385742188, + "learning_rate": 3.309109719911814e-06, + "loss": 0.359, + "step": 645 + }, + { + "epoch": 3.96319018404908, + "grad_norm": 3.3450071811676025, + "learning_rate": 3.304549157224558e-06, + "loss": 0.4042, + "step": 646 + }, + { + "epoch": 3.969325153374233, + "grad_norm": 3.079601287841797, + "learning_rate": 3.299985605874031e-06, + "loss": 0.1699, + "step": 647 + }, + { + "epoch": 3.9754601226993866, + "grad_norm": 3.8963980674743652, + "learning_rate": 3.295419082812483e-06, + "loss": 0.1888, + "step": 648 + }, + { + "epoch": 3.98159509202454, + "grad_norm": 3.307405948638916, + "learning_rate": 3.2908496050032024e-06, + "loss": 0.2824, + "step": 649 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 3.227478265762329, + "learning_rate": 3.2862771894204544e-06, + "loss": 0.3038, + "step": 650 + }, + { + "epoch": 3.9938650306748467, + "grad_norm": 4.046506881713867, + "learning_rate": 3.2817018530494164e-06, + "loss": 0.3266, + "step": 651 + }, + { + "epoch": 4.0, + "grad_norm": 7.775874614715576, + "learning_rate": 3.277123612886116e-06, + "loss": 0.2998, + "step": 652 + }, + { + "epoch": 4.006134969325154, + "grad_norm": 3.146462917327881, + "learning_rate": 3.272542485937369e-06, + "loss": 0.2764, + "step": 653 + }, + { + "epoch": 4.012269938650307, + "grad_norm": 3.0539863109588623, + "learning_rate": 3.2679584892207118e-06, + "loss": 0.1157, + "step": 654 + }, + { + "epoch": 4.0184049079754605, + "grad_norm": 3.634021520614624, + "learning_rate": 3.263371639764343e-06, + "loss": 0.0707, + "step": 655 + }, + { + "epoch": 4.024539877300613, + "grad_norm": 3.3474650382995605, + "learning_rate": 3.2587819546070596e-06, + "loss": 0.1067, + "step": 656 + }, + { + "epoch": 4.030674846625767, + "grad_norm": 4.409244537353516, + "learning_rate": 3.254189450798189e-06, + "loss": 0.0564, + "step": 657 + }, + { + "epoch": 4.03680981595092, + "grad_norm": 3.0446252822875977, + "learning_rate": 3.2495941453975312e-06, + "loss": 0.0535, + "step": 658 + }, + { + "epoch": 4.042944785276074, + "grad_norm": 4.014753818511963, + "learning_rate": 3.2449960554752935e-06, + "loss": 0.1245, + "step": 659 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 3.188062906265259, + "learning_rate": 3.240395198112026e-06, + "loss": 0.0626, + "step": 660 + }, + { + "epoch": 4.0552147239263805, + "grad_norm": 3.006086826324463, + "learning_rate": 3.2357915903985605e-06, + "loss": 0.1198, + "step": 661 + }, + { + "epoch": 4.061349693251533, + "grad_norm": 2.8865551948547363, + "learning_rate": 3.2311852494359423e-06, + "loss": 0.0454, + "step": 662 + }, + { + "epoch": 4.067484662576687, + "grad_norm": 4.2888007164001465, + "learning_rate": 3.226576192335373e-06, + "loss": 0.2064, + "step": 663 + }, + { + "epoch": 4.07361963190184, + "grad_norm": 3.1414525508880615, + "learning_rate": 3.2219644362181436e-06, + "loss": 0.2183, + "step": 664 + }, + { + "epoch": 4.079754601226994, + "grad_norm": 2.556277275085449, + "learning_rate": 3.21734999821557e-06, + "loss": 0.0516, + "step": 665 + }, + { + "epoch": 4.085889570552148, + "grad_norm": 2.698118209838867, + "learning_rate": 3.2127328954689307e-06, + "loss": 0.0613, + "step": 666 + }, + { + "epoch": 4.0920245398773005, + "grad_norm": 2.869919538497925, + "learning_rate": 3.2081131451294025e-06, + "loss": 0.0583, + "step": 667 + }, + { + "epoch": 4.098159509202454, + "grad_norm": 3.8786919116973877, + "learning_rate": 3.2034907643579988e-06, + "loss": 0.0766, + "step": 668 + }, + { + "epoch": 4.104294478527607, + "grad_norm": 4.224637031555176, + "learning_rate": 3.1988657703255043e-06, + "loss": 0.1099, + "step": 669 + }, + { + "epoch": 4.110429447852761, + "grad_norm": 4.671669006347656, + "learning_rate": 3.194238180212409e-06, + "loss": 0.1663, + "step": 670 + }, + { + "epoch": 4.116564417177914, + "grad_norm": 3.2484257221221924, + "learning_rate": 3.1896080112088477e-06, + "loss": 0.0587, + "step": 671 + }, + { + "epoch": 4.122699386503068, + "grad_norm": 2.4808075428009033, + "learning_rate": 3.184975280514536e-06, + "loss": 0.0579, + "step": 672 + }, + { + "epoch": 4.128834355828221, + "grad_norm": 3.7106919288635254, + "learning_rate": 3.1803400053387044e-06, + "loss": 0.1083, + "step": 673 + }, + { + "epoch": 4.134969325153374, + "grad_norm": 3.008970260620117, + "learning_rate": 3.175702202900036e-06, + "loss": 0.1355, + "step": 674 + }, + { + "epoch": 4.141104294478527, + "grad_norm": 3.2640793323516846, + "learning_rate": 3.1710618904266006e-06, + "loss": 0.092, + "step": 675 + }, + { + "epoch": 4.147239263803681, + "grad_norm": 3.08042049407959, + "learning_rate": 3.166419085155793e-06, + "loss": 0.0563, + "step": 676 + }, + { + "epoch": 4.153374233128835, + "grad_norm": 2.993530511856079, + "learning_rate": 3.1617738043342695e-06, + "loss": 0.1773, + "step": 677 + }, + { + "epoch": 4.159509202453988, + "grad_norm": 2.6218204498291016, + "learning_rate": 3.157126065217879e-06, + "loss": 0.0489, + "step": 678 + }, + { + "epoch": 4.1656441717791415, + "grad_norm": 4.3173723220825195, + "learning_rate": 3.152475885071606e-06, + "loss": 0.1333, + "step": 679 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 3.659149408340454, + "learning_rate": 3.147823281169498e-06, + "loss": 0.1501, + "step": 680 + }, + { + "epoch": 4.177914110429448, + "grad_norm": 3.0953338146209717, + "learning_rate": 3.143168270794612e-06, + "loss": 0.1067, + "step": 681 + }, + { + "epoch": 4.184049079754601, + "grad_norm": 3.5693907737731934, + "learning_rate": 3.1385108712389394e-06, + "loss": 0.2499, + "step": 682 + }, + { + "epoch": 4.190184049079755, + "grad_norm": 3.3022868633270264, + "learning_rate": 3.1338510998033484e-06, + "loss": 0.1748, + "step": 683 + }, + { + "epoch": 4.196319018404908, + "grad_norm": 3.7468113899230957, + "learning_rate": 3.129188973797519e-06, + "loss": 0.201, + "step": 684 + }, + { + "epoch": 4.2024539877300615, + "grad_norm": 2.8381078243255615, + "learning_rate": 3.124524510539875e-06, + "loss": 0.0735, + "step": 685 + }, + { + "epoch": 4.208588957055214, + "grad_norm": 2.84706974029541, + "learning_rate": 3.119857727357527e-06, + "loss": 0.1806, + "step": 686 + }, + { + "epoch": 4.214723926380368, + "grad_norm": 3.8130292892456055, + "learning_rate": 3.1151886415861993e-06, + "loss": 0.1811, + "step": 687 + }, + { + "epoch": 4.220858895705521, + "grad_norm": 3.528895378112793, + "learning_rate": 3.1105172705701708e-06, + "loss": 0.1634, + "step": 688 + }, + { + "epoch": 4.226993865030675, + "grad_norm": 5.028727054595947, + "learning_rate": 3.1058436316622103e-06, + "loss": 0.1625, + "step": 689 + }, + { + "epoch": 4.233128834355828, + "grad_norm": 4.606889247894287, + "learning_rate": 3.1011677422235093e-06, + "loss": 0.1791, + "step": 690 + }, + { + "epoch": 4.2392638036809815, + "grad_norm": 3.3620636463165283, + "learning_rate": 3.0964896196236217e-06, + "loss": 0.2233, + "step": 691 + }, + { + "epoch": 4.245398773006135, + "grad_norm": 3.7845852375030518, + "learning_rate": 3.0918092812403954e-06, + "loss": 0.1142, + "step": 692 + }, + { + "epoch": 4.251533742331288, + "grad_norm": 3.1204118728637695, + "learning_rate": 3.0871267444599098e-06, + "loss": 0.096, + "step": 693 + }, + { + "epoch": 4.257668711656442, + "grad_norm": 3.686067819595337, + "learning_rate": 3.0824420266764093e-06, + "loss": 0.2749, + "step": 694 + }, + { + "epoch": 4.263803680981595, + "grad_norm": 3.1680829524993896, + "learning_rate": 3.077755145292243e-06, + "loss": 0.2504, + "step": 695 + }, + { + "epoch": 4.269938650306749, + "grad_norm": 3.3179469108581543, + "learning_rate": 3.0730661177177957e-06, + "loss": 0.1324, + "step": 696 + }, + { + "epoch": 4.276073619631902, + "grad_norm": 3.1186370849609375, + "learning_rate": 3.0683749613714238e-06, + "loss": 0.0691, + "step": 697 + }, + { + "epoch": 4.282208588957055, + "grad_norm": 3.086834192276001, + "learning_rate": 3.063681693679391e-06, + "loss": 0.1026, + "step": 698 + }, + { + "epoch": 4.288343558282208, + "grad_norm": 4.629584312438965, + "learning_rate": 3.0589863320758063e-06, + "loss": 0.2646, + "step": 699 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 3.9641213417053223, + "learning_rate": 3.0542888940025562e-06, + "loss": 0.1711, + "step": 700 + }, + { + "epoch": 4.300613496932515, + "grad_norm": 3.75014328956604, + "learning_rate": 3.0495893969092395e-06, + "loss": 0.0589, + "step": 701 + }, + { + "epoch": 4.306748466257669, + "grad_norm": 3.603290319442749, + "learning_rate": 3.044887858253105e-06, + "loss": 0.2244, + "step": 702 + }, + { + "epoch": 4.3128834355828225, + "grad_norm": 3.79404616355896, + "learning_rate": 3.040184295498984e-06, + "loss": 0.1506, + "step": 703 + }, + { + "epoch": 4.319018404907975, + "grad_norm": 3.0890021324157715, + "learning_rate": 3.035478726119228e-06, + "loss": 0.2343, + "step": 704 + }, + { + "epoch": 4.325153374233129, + "grad_norm": 3.6688191890716553, + "learning_rate": 3.0307711675936426e-06, + "loss": 0.0518, + "step": 705 + }, + { + "epoch": 4.331288343558282, + "grad_norm": 5.1836700439453125, + "learning_rate": 3.0260616374094208e-06, + "loss": 0.2363, + "step": 706 + }, + { + "epoch": 4.337423312883436, + "grad_norm": 2.7123284339904785, + "learning_rate": 3.0213501530610807e-06, + "loss": 0.0848, + "step": 707 + }, + { + "epoch": 4.343558282208589, + "grad_norm": 3.5661890506744385, + "learning_rate": 3.0166367320504005e-06, + "loss": 0.149, + "step": 708 + }, + { + "epoch": 4.3496932515337425, + "grad_norm": 3.6454737186431885, + "learning_rate": 3.0119213918863515e-06, + "loss": 0.1133, + "step": 709 + }, + { + "epoch": 4.355828220858895, + "grad_norm": 3.7534968852996826, + "learning_rate": 3.0072041500850343e-06, + "loss": 0.1358, + "step": 710 + }, + { + "epoch": 4.361963190184049, + "grad_norm": 3.40387225151062, + "learning_rate": 3.0024850241696128e-06, + "loss": 0.0706, + "step": 711 + }, + { + "epoch": 4.368098159509202, + "grad_norm": 3.250471591949463, + "learning_rate": 2.9977640316702512e-06, + "loss": 0.1977, + "step": 712 + }, + { + "epoch": 4.374233128834356, + "grad_norm": 3.417781352996826, + "learning_rate": 2.993041190124047e-06, + "loss": 0.2622, + "step": 713 + }, + { + "epoch": 4.38036809815951, + "grad_norm": 2.628434181213379, + "learning_rate": 2.9883165170749657e-06, + "loss": 0.1487, + "step": 714 + }, + { + "epoch": 4.386503067484663, + "grad_norm": 3.240264892578125, + "learning_rate": 2.9835900300737763e-06, + "loss": 0.0822, + "step": 715 + }, + { + "epoch": 4.392638036809816, + "grad_norm": 6.575517177581787, + "learning_rate": 2.9788617466779884e-06, + "loss": 0.3668, + "step": 716 + }, + { + "epoch": 4.398773006134969, + "grad_norm": 4.699089050292969, + "learning_rate": 2.974131684451781e-06, + "loss": 0.2432, + "step": 717 + }, + { + "epoch": 4.404907975460123, + "grad_norm": 2.9815752506256104, + "learning_rate": 2.9693998609659443e-06, + "loss": 0.0689, + "step": 718 + }, + { + "epoch": 4.411042944785276, + "grad_norm": 4.192755222320557, + "learning_rate": 2.9646662937978082e-06, + "loss": 0.1897, + "step": 719 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 2.9729068279266357, + "learning_rate": 2.9599310005311824e-06, + "loss": 0.0457, + "step": 720 + }, + { + "epoch": 4.423312883435583, + "grad_norm": 4.234438896179199, + "learning_rate": 2.9551939987562866e-06, + "loss": 0.2307, + "step": 721 + }, + { + "epoch": 4.429447852760736, + "grad_norm": 3.3982434272766113, + "learning_rate": 2.950455306069688e-06, + "loss": 0.0637, + "step": 722 + }, + { + "epoch": 4.435582822085889, + "grad_norm": 4.539764404296875, + "learning_rate": 2.9457149400742357e-06, + "loss": 0.1924, + "step": 723 + }, + { + "epoch": 4.441717791411043, + "grad_norm": 4.039684772491455, + "learning_rate": 2.940972918378993e-06, + "loss": 0.1275, + "step": 724 + }, + { + "epoch": 4.447852760736196, + "grad_norm": 4.340360641479492, + "learning_rate": 2.936229258599174e-06, + "loss": 0.123, + "step": 725 + }, + { + "epoch": 4.45398773006135, + "grad_norm": 2.8720109462738037, + "learning_rate": 2.93148397835608e-06, + "loss": 0.0555, + "step": 726 + }, + { + "epoch": 4.460122699386503, + "grad_norm": 4.227811336517334, + "learning_rate": 2.926737095277029e-06, + "loss": 0.0991, + "step": 727 + }, + { + "epoch": 4.466257668711656, + "grad_norm": 2.8079142570495605, + "learning_rate": 2.921988626995295e-06, + "loss": 0.0628, + "step": 728 + }, + { + "epoch": 4.47239263803681, + "grad_norm": 4.195122241973877, + "learning_rate": 2.9172385911500385e-06, + "loss": 0.2333, + "step": 729 + }, + { + "epoch": 4.478527607361963, + "grad_norm": 3.223794460296631, + "learning_rate": 2.9124870053862447e-06, + "loss": 0.1317, + "step": 730 + }, + { + "epoch": 4.484662576687117, + "grad_norm": 3.5533759593963623, + "learning_rate": 2.907733887354657e-06, + "loss": 0.2285, + "step": 731 + }, + { + "epoch": 4.49079754601227, + "grad_norm": 3.535673141479492, + "learning_rate": 2.9029792547117088e-06, + "loss": 0.096, + "step": 732 + }, + { + "epoch": 4.4969325153374236, + "grad_norm": 4.031703948974609, + "learning_rate": 2.898223125119461e-06, + "loss": 0.1505, + "step": 733 + }, + { + "epoch": 4.5030674846625764, + "grad_norm": 2.823413610458374, + "learning_rate": 2.893465516245534e-06, + "loss": 0.0327, + "step": 734 + }, + { + "epoch": 4.50920245398773, + "grad_norm": 3.516738176345825, + "learning_rate": 2.8887064457630453e-06, + "loss": 0.0743, + "step": 735 + }, + { + "epoch": 4.515337423312883, + "grad_norm": 3.5523500442504883, + "learning_rate": 2.8839459313505407e-06, + "loss": 0.1768, + "step": 736 + }, + { + "epoch": 4.521472392638037, + "grad_norm": 3.2433223724365234, + "learning_rate": 2.879183990691929e-06, + "loss": 0.1598, + "step": 737 + }, + { + "epoch": 4.52760736196319, + "grad_norm": 3.0156848430633545, + "learning_rate": 2.8744206414764185e-06, + "loss": 0.0829, + "step": 738 + }, + { + "epoch": 4.533742331288344, + "grad_norm": 4.359529495239258, + "learning_rate": 2.8696559013984488e-06, + "loss": 0.1169, + "step": 739 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 2.3862433433532715, + "learning_rate": 2.8648897881576274e-06, + "loss": 0.0962, + "step": 740 + }, + { + "epoch": 4.54601226993865, + "grad_norm": 2.7100136280059814, + "learning_rate": 2.8601223194586613e-06, + "loss": 0.1204, + "step": 741 + }, + { + "epoch": 4.552147239263804, + "grad_norm": 3.8116140365600586, + "learning_rate": 2.8553535130112935e-06, + "loss": 0.0685, + "step": 742 + }, + { + "epoch": 4.558282208588957, + "grad_norm": 2.9640142917633057, + "learning_rate": 2.850583386530235e-06, + "loss": 0.0692, + "step": 743 + }, + { + "epoch": 4.564417177914111, + "grad_norm": 3.264592170715332, + "learning_rate": 2.8458119577351035e-06, + "loss": 0.2128, + "step": 744 + }, + { + "epoch": 4.570552147239264, + "grad_norm": 3.230497360229492, + "learning_rate": 2.841039244350351e-06, + "loss": 0.2409, + "step": 745 + }, + { + "epoch": 4.576687116564417, + "grad_norm": 4.41513204574585, + "learning_rate": 2.8362652641052024e-06, + "loss": 0.1878, + "step": 746 + }, + { + "epoch": 4.58282208588957, + "grad_norm": 3.047248601913452, + "learning_rate": 2.83149003473359e-06, + "loss": 0.1303, + "step": 747 + }, + { + "epoch": 4.588957055214724, + "grad_norm": 2.399754047393799, + "learning_rate": 2.8267135739740836e-06, + "loss": 0.0577, + "step": 748 + }, + { + "epoch": 4.595092024539877, + "grad_norm": 4.608038425445557, + "learning_rate": 2.8219358995698307e-06, + "loss": 0.2329, + "step": 749 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 3.537644147872925, + "learning_rate": 2.8171570292684846e-06, + "loss": 0.1329, + "step": 750 + }, + { + "epoch": 4.6073619631901845, + "grad_norm": 2.8099827766418457, + "learning_rate": 2.8123769808221407e-06, + "loss": 0.1512, + "step": 751 + }, + { + "epoch": 4.613496932515337, + "grad_norm": 3.3169758319854736, + "learning_rate": 2.8075957719872724e-06, + "loss": 0.1267, + "step": 752 + }, + { + "epoch": 4.61963190184049, + "grad_norm": 3.578435182571411, + "learning_rate": 2.8028134205246633e-06, + "loss": 0.147, + "step": 753 + }, + { + "epoch": 4.625766871165644, + "grad_norm": 3.544437885284424, + "learning_rate": 2.7980299441993415e-06, + "loss": 0.0947, + "step": 754 + }, + { + "epoch": 4.631901840490798, + "grad_norm": 3.798776388168335, + "learning_rate": 2.793245360780512e-06, + "loss": 0.1498, + "step": 755 + }, + { + "epoch": 4.638036809815951, + "grad_norm": 3.634991407394409, + "learning_rate": 2.788459688041495e-06, + "loss": 0.2504, + "step": 756 + }, + { + "epoch": 4.644171779141105, + "grad_norm": 20.123680114746094, + "learning_rate": 2.783672943759655e-06, + "loss": 0.2091, + "step": 757 + }, + { + "epoch": 4.6503067484662575, + "grad_norm": 3.9357221126556396, + "learning_rate": 2.778885145716339e-06, + "loss": 0.2045, + "step": 758 + }, + { + "epoch": 4.656441717791411, + "grad_norm": 3.3035309314727783, + "learning_rate": 2.7740963116968063e-06, + "loss": 0.1416, + "step": 759 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 3.096985101699829, + "learning_rate": 2.7693064594901646e-06, + "loss": 0.0455, + "step": 760 + }, + { + "epoch": 4.668711656441718, + "grad_norm": 2.9855458736419678, + "learning_rate": 2.7645156068893075e-06, + "loss": 0.1496, + "step": 761 + }, + { + "epoch": 4.674846625766871, + "grad_norm": 3.9140093326568604, + "learning_rate": 2.759723771690839e-06, + "loss": 0.2061, + "step": 762 + }, + { + "epoch": 4.680981595092025, + "grad_norm": 3.590569496154785, + "learning_rate": 2.754930971695019e-06, + "loss": 0.1017, + "step": 763 + }, + { + "epoch": 4.6871165644171775, + "grad_norm": 3.527254581451416, + "learning_rate": 2.750137224705687e-06, + "loss": 0.1979, + "step": 764 + }, + { + "epoch": 4.693251533742331, + "grad_norm": 4.198459148406982, + "learning_rate": 2.745342548530202e-06, + "loss": 0.1667, + "step": 765 + }, + { + "epoch": 4.699386503067485, + "grad_norm": 2.0246167182922363, + "learning_rate": 2.7405469609793746e-06, + "loss": 0.0346, + "step": 766 + }, + { + "epoch": 4.705521472392638, + "grad_norm": 3.2045300006866455, + "learning_rate": 2.7357504798674004e-06, + "loss": 0.0596, + "step": 767 + }, + { + "epoch": 4.711656441717792, + "grad_norm": 2.736985921859741, + "learning_rate": 2.730953123011796e-06, + "loss": 0.0384, + "step": 768 + }, + { + "epoch": 4.717791411042945, + "grad_norm": 3.0621395111083984, + "learning_rate": 2.726154908233328e-06, + "loss": 0.0558, + "step": 769 + }, + { + "epoch": 4.723926380368098, + "grad_norm": 3.2280497550964355, + "learning_rate": 2.721355853355953e-06, + "loss": 0.2272, + "step": 770 + }, + { + "epoch": 4.730061349693251, + "grad_norm": 3.342226028442383, + "learning_rate": 2.716555976206748e-06, + "loss": 0.074, + "step": 771 + }, + { + "epoch": 4.736196319018405, + "grad_norm": 4.328624248504639, + "learning_rate": 2.7117552946158415e-06, + "loss": 0.1034, + "step": 772 + }, + { + "epoch": 4.742331288343558, + "grad_norm": 2.980215311050415, + "learning_rate": 2.706953826416353e-06, + "loss": 0.1199, + "step": 773 + }, + { + "epoch": 4.748466257668712, + "grad_norm": 2.622478485107422, + "learning_rate": 2.702151589444324e-06, + "loss": 0.0467, + "step": 774 + }, + { + "epoch": 4.754601226993865, + "grad_norm": 2.9958693981170654, + "learning_rate": 2.6973486015386507e-06, + "loss": 0.143, + "step": 775 + }, + { + "epoch": 4.7607361963190185, + "grad_norm": 4.548511505126953, + "learning_rate": 2.6925448805410197e-06, + "loss": 0.3594, + "step": 776 + }, + { + "epoch": 4.766871165644172, + "grad_norm": 3.3429481983184814, + "learning_rate": 2.6877404442958393e-06, + "loss": 0.1397, + "step": 777 + }, + { + "epoch": 4.773006134969325, + "grad_norm": 2.5820136070251465, + "learning_rate": 2.682935310650177e-06, + "loss": 0.054, + "step": 778 + }, + { + "epoch": 4.779141104294479, + "grad_norm": 4.047626495361328, + "learning_rate": 2.6781294974536886e-06, + "loss": 0.1284, + "step": 779 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 3.0227510929107666, + "learning_rate": 2.673323022558557e-06, + "loss": 0.1441, + "step": 780 + }, + { + "epoch": 4.791411042944786, + "grad_norm": 4.731313705444336, + "learning_rate": 2.6685159038194202e-06, + "loss": 0.2859, + "step": 781 + }, + { + "epoch": 4.7975460122699385, + "grad_norm": 3.880655288696289, + "learning_rate": 2.6637081590933096e-06, + "loss": 0.1524, + "step": 782 + }, + { + "epoch": 4.803680981595092, + "grad_norm": 2.375474452972412, + "learning_rate": 2.6588998062395803e-06, + "loss": 0.0338, + "step": 783 + }, + { + "epoch": 4.809815950920245, + "grad_norm": 3.3587446212768555, + "learning_rate": 2.6540908631198498e-06, + "loss": 0.0755, + "step": 784 + }, + { + "epoch": 4.815950920245399, + "grad_norm": 2.767686367034912, + "learning_rate": 2.6492813475979243e-06, + "loss": 0.0631, + "step": 785 + }, + { + "epoch": 4.822085889570552, + "grad_norm": 3.88670015335083, + "learning_rate": 2.6444712775397397e-06, + "loss": 0.0853, + "step": 786 + }, + { + "epoch": 4.828220858895706, + "grad_norm": 3.543276309967041, + "learning_rate": 2.639660670813288e-06, + "loss": 0.1895, + "step": 787 + }, + { + "epoch": 4.8343558282208585, + "grad_norm": 3.659323215484619, + "learning_rate": 2.6348495452885598e-06, + "loss": 0.1745, + "step": 788 + }, + { + "epoch": 4.840490797546012, + "grad_norm": 3.0955021381378174, + "learning_rate": 2.630037918837468e-06, + "loss": 0.0846, + "step": 789 + }, + { + "epoch": 4.846625766871165, + "grad_norm": 3.4473249912261963, + "learning_rate": 2.6252258093337892e-06, + "loss": 0.0808, + "step": 790 + }, + { + "epoch": 4.852760736196319, + "grad_norm": 3.937120199203491, + "learning_rate": 2.6204132346530936e-06, + "loss": 0.2054, + "step": 791 + }, + { + "epoch": 4.858895705521473, + "grad_norm": 4.052806854248047, + "learning_rate": 2.6156002126726788e-06, + "loss": 0.1679, + "step": 792 + }, + { + "epoch": 4.865030674846626, + "grad_norm": 2.6694889068603516, + "learning_rate": 2.6107867612715043e-06, + "loss": 0.0534, + "step": 793 + }, + { + "epoch": 4.871165644171779, + "grad_norm": 3.594649076461792, + "learning_rate": 2.6059728983301267e-06, + "loss": 0.0899, + "step": 794 + }, + { + "epoch": 4.877300613496932, + "grad_norm": 2.7796030044555664, + "learning_rate": 2.601158641730629e-06, + "loss": 0.0596, + "step": 795 + }, + { + "epoch": 4.883435582822086, + "grad_norm": 4.618961334228516, + "learning_rate": 2.5963440093565567e-06, + "loss": 0.3858, + "step": 796 + }, + { + "epoch": 4.889570552147239, + "grad_norm": 3.0783939361572266, + "learning_rate": 2.5915290190928518e-06, + "loss": 0.12, + "step": 797 + }, + { + "epoch": 4.895705521472393, + "grad_norm": 4.078456878662109, + "learning_rate": 2.586713688825786e-06, + "loss": 0.1278, + "step": 798 + }, + { + "epoch": 4.901840490797546, + "grad_norm": 2.9439120292663574, + "learning_rate": 2.5818980364428935e-06, + "loss": 0.0847, + "step": 799 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 5.140681743621826, + "learning_rate": 2.5770820798329055e-06, + "loss": 0.1718, + "step": 800 + }, + { + "epoch": 4.914110429447852, + "grad_norm": 3.450190305709839, + "learning_rate": 2.572265836885682e-06, + "loss": 0.0895, + "step": 801 + }, + { + "epoch": 4.920245398773006, + "grad_norm": 3.1145224571228027, + "learning_rate": 2.567449325492149e-06, + "loss": 0.0652, + "step": 802 + }, + { + "epoch": 4.92638036809816, + "grad_norm": 2.851768732070923, + "learning_rate": 2.5626325635442283e-06, + "loss": 0.0877, + "step": 803 + }, + { + "epoch": 4.932515337423313, + "grad_norm": 3.3392980098724365, + "learning_rate": 2.5578155689347716e-06, + "loss": 0.2028, + "step": 804 + }, + { + "epoch": 4.938650306748467, + "grad_norm": 3.012439250946045, + "learning_rate": 2.5529983595574964e-06, + "loss": 0.031, + "step": 805 + }, + { + "epoch": 4.9447852760736195, + "grad_norm": 2.7732717990875244, + "learning_rate": 2.548180953306918e-06, + "loss": 0.0415, + "step": 806 + }, + { + "epoch": 4.950920245398773, + "grad_norm": 3.0423903465270996, + "learning_rate": 2.5433633680782817e-06, + "loss": 0.1188, + "step": 807 + }, + { + "epoch": 4.957055214723926, + "grad_norm": 5.056387901306152, + "learning_rate": 2.538545621767498e-06, + "loss": 0.1703, + "step": 808 + }, + { + "epoch": 4.96319018404908, + "grad_norm": 4.052585124969482, + "learning_rate": 2.533727732271077e-06, + "loss": 0.1455, + "step": 809 + }, + { + "epoch": 4.969325153374233, + "grad_norm": 3.4507904052734375, + "learning_rate": 2.5289097174860593e-06, + "loss": 0.0617, + "step": 810 + }, + { + "epoch": 4.975460122699387, + "grad_norm": 2.908266305923462, + "learning_rate": 2.524091595309952e-06, + "loss": 0.1173, + "step": 811 + }, + { + "epoch": 4.9815950920245395, + "grad_norm": 2.5857458114624023, + "learning_rate": 2.519273383640661e-06, + "loss": 0.0538, + "step": 812 + }, + { + "epoch": 4.987730061349693, + "grad_norm": 3.3518428802490234, + "learning_rate": 2.5144551003764227e-06, + "loss": 0.211, + "step": 813 + }, + { + "epoch": 4.993865030674847, + "grad_norm": 3.137981653213501, + "learning_rate": 2.509636763415742e-06, + "loss": 0.0944, + "step": 814 + }, + { + "epoch": 5.0, + "grad_norm": 2.8854241371154785, + "learning_rate": 2.5048183906573227e-06, + "loss": 0.098, + "step": 815 + }, + { + "epoch": 5.006134969325154, + "grad_norm": 3.508527994155884, + "learning_rate": 2.5e-06, + "loss": 0.1102, + "step": 816 + }, + { + "epoch": 5.012269938650307, + "grad_norm": 2.448152542114258, + "learning_rate": 2.495181609342678e-06, + "loss": 0.0712, + "step": 817 + }, + { + "epoch": 5.0184049079754605, + "grad_norm": 3.105818748474121, + "learning_rate": 2.4903632365842587e-06, + "loss": 0.0414, + "step": 818 + }, + { + "epoch": 5.024539877300613, + "grad_norm": 3.8048601150512695, + "learning_rate": 2.4855448996235777e-06, + "loss": 0.0894, + "step": 819 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 3.259834051132202, + "learning_rate": 2.48072661635934e-06, + "loss": 0.0796, + "step": 820 + }, + { + "epoch": 5.03680981595092, + "grad_norm": 2.822364568710327, + "learning_rate": 2.475908404690049e-06, + "loss": 0.0349, + "step": 821 + }, + { + "epoch": 5.042944785276074, + "grad_norm": 4.78808069229126, + "learning_rate": 2.4710902825139415e-06, + "loss": 0.2529, + "step": 822 + }, + { + "epoch": 5.049079754601227, + "grad_norm": 3.5420572757720947, + "learning_rate": 2.466272267728924e-06, + "loss": 0.1405, + "step": 823 + }, + { + "epoch": 5.0552147239263805, + "grad_norm": 2.500713348388672, + "learning_rate": 2.461454378232503e-06, + "loss": 0.0408, + "step": 824 + }, + { + "epoch": 5.061349693251533, + "grad_norm": 3.266291618347168, + "learning_rate": 2.4566366319217196e-06, + "loss": 0.0338, + "step": 825 + }, + { + "epoch": 5.067484662576687, + "grad_norm": 4.071012020111084, + "learning_rate": 2.4518190466930837e-06, + "loss": 0.06, + "step": 826 + }, + { + "epoch": 5.07361963190184, + "grad_norm": 4.3747172355651855, + "learning_rate": 2.4470016404425045e-06, + "loss": 0.1184, + "step": 827 + }, + { + "epoch": 5.079754601226994, + "grad_norm": 3.92030668258667, + "learning_rate": 2.4421844310652296e-06, + "loss": 0.1369, + "step": 828 + }, + { + "epoch": 5.085889570552148, + "grad_norm": 3.3482303619384766, + "learning_rate": 2.437367436455773e-06, + "loss": 0.1166, + "step": 829 + }, + { + "epoch": 5.0920245398773005, + "grad_norm": 3.429368019104004, + "learning_rate": 2.4325506745078524e-06, + "loss": 0.1214, + "step": 830 + }, + { + "epoch": 5.098159509202454, + "grad_norm": 3.4915647506713867, + "learning_rate": 2.427734163114319e-06, + "loss": 0.0454, + "step": 831 + }, + { + "epoch": 5.104294478527607, + "grad_norm": 3.1721251010894775, + "learning_rate": 2.4229179201670954e-06, + "loss": 0.0431, + "step": 832 + }, + { + "epoch": 5.110429447852761, + "grad_norm": 2.552578926086426, + "learning_rate": 2.418101963557107e-06, + "loss": 0.0347, + "step": 833 + }, + { + "epoch": 5.116564417177914, + "grad_norm": 3.518169403076172, + "learning_rate": 2.413286311174214e-06, + "loss": 0.1555, + "step": 834 + }, + { + "epoch": 5.122699386503068, + "grad_norm": 2.4452908039093018, + "learning_rate": 2.4084709809071487e-06, + "loss": 0.035, + "step": 835 + }, + { + "epoch": 5.128834355828221, + "grad_norm": 3.5366528034210205, + "learning_rate": 2.403655990643444e-06, + "loss": 0.0798, + "step": 836 + }, + { + "epoch": 5.134969325153374, + "grad_norm": 2.300065040588379, + "learning_rate": 2.398841358269371e-06, + "loss": 0.0178, + "step": 837 + }, + { + "epoch": 5.141104294478527, + "grad_norm": 2.851393699645996, + "learning_rate": 2.3940271016698733e-06, + "loss": 0.0447, + "step": 838 + }, + { + "epoch": 5.147239263803681, + "grad_norm": 4.085958957672119, + "learning_rate": 2.3892132387284956e-06, + "loss": 0.1626, + "step": 839 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 3.4240522384643555, + "learning_rate": 2.384399787327322e-06, + "loss": 0.0914, + "step": 840 + }, + { + "epoch": 5.159509202453988, + "grad_norm": 4.111586570739746, + "learning_rate": 2.3795867653469072e-06, + "loss": 0.0784, + "step": 841 + }, + { + "epoch": 5.1656441717791415, + "grad_norm": 2.3306312561035156, + "learning_rate": 2.374774190666211e-06, + "loss": 0.0216, + "step": 842 + }, + { + "epoch": 5.171779141104294, + "grad_norm": 2.5006275177001953, + "learning_rate": 2.3699620811625327e-06, + "loss": 0.0516, + "step": 843 + }, + { + "epoch": 5.177914110429448, + "grad_norm": 3.1680967807769775, + "learning_rate": 2.365150454711441e-06, + "loss": 0.0517, + "step": 844 + }, + { + "epoch": 5.184049079754601, + "grad_norm": 1.817044734954834, + "learning_rate": 2.3603393291867122e-06, + "loss": 0.0264, + "step": 845 + }, + { + "epoch": 5.190184049079755, + "grad_norm": 4.445211887359619, + "learning_rate": 2.355528722460261e-06, + "loss": 0.1079, + "step": 846 + }, + { + "epoch": 5.196319018404908, + "grad_norm": 2.918304681777954, + "learning_rate": 2.350718652402076e-06, + "loss": 0.0633, + "step": 847 + }, + { + "epoch": 5.2024539877300615, + "grad_norm": 3.6307432651519775, + "learning_rate": 2.345909136880151e-06, + "loss": 0.1013, + "step": 848 + }, + { + "epoch": 5.208588957055214, + "grad_norm": 3.5696842670440674, + "learning_rate": 2.34110019376042e-06, + "loss": 0.0199, + "step": 849 + }, + { + "epoch": 5.214723926380368, + "grad_norm": 2.2214856147766113, + "learning_rate": 2.336291840906691e-06, + "loss": 0.0288, + "step": 850 + }, + { + "epoch": 5.220858895705521, + "grad_norm": 2.5375778675079346, + "learning_rate": 2.3314840961805806e-06, + "loss": 0.0142, + "step": 851 + }, + { + "epoch": 5.226993865030675, + "grad_norm": 3.0093517303466797, + "learning_rate": 2.326676977441444e-06, + "loss": 0.0911, + "step": 852 + }, + { + "epoch": 5.233128834355828, + "grad_norm": 2.7067151069641113, + "learning_rate": 2.3218705025463118e-06, + "loss": 0.0315, + "step": 853 + }, + { + "epoch": 5.2392638036809815, + "grad_norm": 3.1892940998077393, + "learning_rate": 2.3170646893498237e-06, + "loss": 0.1344, + "step": 854 + }, + { + "epoch": 5.245398773006135, + "grad_norm": 2.8909313678741455, + "learning_rate": 2.312259555704161e-06, + "loss": 0.034, + "step": 855 + }, + { + "epoch": 5.251533742331288, + "grad_norm": 5.097650051116943, + "learning_rate": 2.3074551194589816e-06, + "loss": 0.1889, + "step": 856 + }, + { + "epoch": 5.257668711656442, + "grad_norm": 3.8511006832122803, + "learning_rate": 2.3026513984613506e-06, + "loss": 0.0794, + "step": 857 + }, + { + "epoch": 5.263803680981595, + "grad_norm": 2.2874133586883545, + "learning_rate": 2.297848410555677e-06, + "loss": 0.0238, + "step": 858 + }, + { + "epoch": 5.269938650306749, + "grad_norm": 3.504723310470581, + "learning_rate": 2.293046173583648e-06, + "loss": 0.0369, + "step": 859 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 3.2108154296875, + "learning_rate": 2.28824470538416e-06, + "loss": 0.0677, + "step": 860 + }, + { + "epoch": 5.282208588957055, + "grad_norm": 2.2249386310577393, + "learning_rate": 2.2834440237932537e-06, + "loss": 0.0244, + "step": 861 + }, + { + "epoch": 5.288343558282208, + "grad_norm": 3.141784191131592, + "learning_rate": 2.2786441466440474e-06, + "loss": 0.0628, + "step": 862 + }, + { + "epoch": 5.294478527607362, + "grad_norm": 3.5597352981567383, + "learning_rate": 2.2738450917666727e-06, + "loss": 0.0914, + "step": 863 + }, + { + "epoch": 5.300613496932515, + "grad_norm": 2.991966962814331, + "learning_rate": 2.269046876988204e-06, + "loss": 0.0546, + "step": 864 + }, + { + "epoch": 5.306748466257669, + "grad_norm": 3.100776195526123, + "learning_rate": 2.2642495201325995e-06, + "loss": 0.0473, + "step": 865 + }, + { + "epoch": 5.3128834355828225, + "grad_norm": 2.541754722595215, + "learning_rate": 2.259453039020626e-06, + "loss": 0.0613, + "step": 866 + }, + { + "epoch": 5.319018404907975, + "grad_norm": 2.8117194175720215, + "learning_rate": 2.2546574514697985e-06, + "loss": 0.0533, + "step": 867 + }, + { + "epoch": 5.325153374233129, + "grad_norm": 2.5676379203796387, + "learning_rate": 2.249862775294313e-06, + "loss": 0.018, + "step": 868 + }, + { + "epoch": 5.331288343558282, + "grad_norm": 2.5297701358795166, + "learning_rate": 2.245069028304981e-06, + "loss": 0.0246, + "step": 869 + }, + { + "epoch": 5.337423312883436, + "grad_norm": 2.199498176574707, + "learning_rate": 2.240276228309161e-06, + "loss": 0.0551, + "step": 870 + }, + { + "epoch": 5.343558282208589, + "grad_norm": 2.5793557167053223, + "learning_rate": 2.2354843931106933e-06, + "loss": 0.0258, + "step": 871 + }, + { + "epoch": 5.3496932515337425, + "grad_norm": 3.352058172225952, + "learning_rate": 2.230693540509836e-06, + "loss": 0.0228, + "step": 872 + }, + { + "epoch": 5.355828220858895, + "grad_norm": 2.900599956512451, + "learning_rate": 2.225903688303195e-06, + "loss": 0.0586, + "step": 873 + }, + { + "epoch": 5.361963190184049, + "grad_norm": 3.3317267894744873, + "learning_rate": 2.221114854283662e-06, + "loss": 0.0733, + "step": 874 + }, + { + "epoch": 5.368098159509202, + "grad_norm": 2.79304575920105, + "learning_rate": 2.2163270562403453e-06, + "loss": 0.0251, + "step": 875 + }, + { + "epoch": 5.374233128834356, + "grad_norm": 3.8596227169036865, + "learning_rate": 2.211540311958506e-06, + "loss": 0.0957, + "step": 876 + }, + { + "epoch": 5.38036809815951, + "grad_norm": 2.7464358806610107, + "learning_rate": 2.2067546392194888e-06, + "loss": 0.0457, + "step": 877 + }, + { + "epoch": 5.386503067484663, + "grad_norm": 2.3359906673431396, + "learning_rate": 2.2019700558006598e-06, + "loss": 0.0218, + "step": 878 + }, + { + "epoch": 5.392638036809816, + "grad_norm": 3.2412452697753906, + "learning_rate": 2.197186579475337e-06, + "loss": 0.0494, + "step": 879 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 3.930197238922119, + "learning_rate": 2.1924042280127284e-06, + "loss": 0.0803, + "step": 880 + }, + { + "epoch": 5.404907975460123, + "grad_norm": 2.5752930641174316, + "learning_rate": 2.1876230191778598e-06, + "loss": 0.0356, + "step": 881 + }, + { + "epoch": 5.411042944785276, + "grad_norm": 5.507393836975098, + "learning_rate": 2.182842970731516e-06, + "loss": 0.1245, + "step": 882 + }, + { + "epoch": 5.41717791411043, + "grad_norm": 2.416719436645508, + "learning_rate": 2.17806410043017e-06, + "loss": 0.0224, + "step": 883 + }, + { + "epoch": 5.423312883435583, + "grad_norm": 2.500429630279541, + "learning_rate": 2.173286426025917e-06, + "loss": 0.0499, + "step": 884 + }, + { + "epoch": 5.429447852760736, + "grad_norm": 2.8843860626220703, + "learning_rate": 2.168509965266411e-06, + "loss": 0.075, + "step": 885 + }, + { + "epoch": 5.435582822085889, + "grad_norm": 2.3187198638916016, + "learning_rate": 2.1637347358947984e-06, + "loss": 0.065, + "step": 886 + }, + { + "epoch": 5.441717791411043, + "grad_norm": 2.7135889530181885, + "learning_rate": 2.15896075564965e-06, + "loss": 0.0848, + "step": 887 + }, + { + "epoch": 5.447852760736196, + "grad_norm": 1.751846194267273, + "learning_rate": 2.1541880422648978e-06, + "loss": 0.0112, + "step": 888 + }, + { + "epoch": 5.45398773006135, + "grad_norm": 3.113271713256836, + "learning_rate": 2.1494166134697655e-06, + "loss": 0.077, + "step": 889 + }, + { + "epoch": 5.460122699386503, + "grad_norm": 2.711318016052246, + "learning_rate": 2.1446464869887077e-06, + "loss": 0.03, + "step": 890 + }, + { + "epoch": 5.466257668711656, + "grad_norm": 1.8012003898620605, + "learning_rate": 2.13987768054134e-06, + "loss": 0.0141, + "step": 891 + }, + { + "epoch": 5.47239263803681, + "grad_norm": 2.0968120098114014, + "learning_rate": 2.135110211842374e-06, + "loss": 0.0147, + "step": 892 + }, + { + "epoch": 5.478527607361963, + "grad_norm": 3.1689956188201904, + "learning_rate": 2.1303440986015525e-06, + "loss": 0.1123, + "step": 893 + }, + { + "epoch": 5.484662576687117, + "grad_norm": 4.512697219848633, + "learning_rate": 2.1255793585235827e-06, + "loss": 0.0359, + "step": 894 + }, + { + "epoch": 5.49079754601227, + "grad_norm": 3.5739688873291016, + "learning_rate": 2.120816009308071e-06, + "loss": 0.0635, + "step": 895 + }, + { + "epoch": 5.4969325153374236, + "grad_norm": 4.556554317474365, + "learning_rate": 2.1160540686494597e-06, + "loss": 0.1104, + "step": 896 + }, + { + "epoch": 5.5030674846625764, + "grad_norm": 2.2047064304351807, + "learning_rate": 2.1112935542369546e-06, + "loss": 0.0187, + "step": 897 + }, + { + "epoch": 5.50920245398773, + "grad_norm": 3.0289857387542725, + "learning_rate": 2.106534483754466e-06, + "loss": 0.0874, + "step": 898 + }, + { + "epoch": 5.515337423312883, + "grad_norm": 2.7090444564819336, + "learning_rate": 2.1017768748805396e-06, + "loss": 0.0301, + "step": 899 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 3.0662643909454346, + "learning_rate": 2.0970207452882917e-06, + "loss": 0.1192, + "step": 900 + }, + { + "epoch": 5.52760736196319, + "grad_norm": 2.869401454925537, + "learning_rate": 2.0922661126453436e-06, + "loss": 0.0803, + "step": 901 + }, + { + "epoch": 5.533742331288344, + "grad_norm": 2.229947328567505, + "learning_rate": 2.0875129946137557e-06, + "loss": 0.0186, + "step": 902 + }, + { + "epoch": 5.539877300613497, + "grad_norm": 3.3460421562194824, + "learning_rate": 2.0827614088499624e-06, + "loss": 0.0499, + "step": 903 + }, + { + "epoch": 5.54601226993865, + "grad_norm": 1.9324007034301758, + "learning_rate": 2.0780113730047056e-06, + "loss": 0.0322, + "step": 904 + }, + { + "epoch": 5.552147239263804, + "grad_norm": 2.761482000350952, + "learning_rate": 2.0732629047229712e-06, + "loss": 0.0265, + "step": 905 + }, + { + "epoch": 5.558282208588957, + "grad_norm": 2.4173266887664795, + "learning_rate": 2.0685160216439205e-06, + "loss": 0.0229, + "step": 906 + }, + { + "epoch": 5.564417177914111, + "grad_norm": 2.503661632537842, + "learning_rate": 2.0637707414008267e-06, + "loss": 0.0266, + "step": 907 + }, + { + "epoch": 5.570552147239264, + "grad_norm": 2.312236785888672, + "learning_rate": 2.0590270816210077e-06, + "loss": 0.018, + "step": 908 + }, + { + "epoch": 5.576687116564417, + "grad_norm": 2.569575548171997, + "learning_rate": 2.0542850599257647e-06, + "loss": 0.0377, + "step": 909 + }, + { + "epoch": 5.58282208588957, + "grad_norm": 3.520341157913208, + "learning_rate": 2.0495446939303122e-06, + "loss": 0.1224, + "step": 910 + }, + { + "epoch": 5.588957055214724, + "grad_norm": 3.231363296508789, + "learning_rate": 2.044806001243714e-06, + "loss": 0.1457, + "step": 911 + }, + { + "epoch": 5.595092024539877, + "grad_norm": 3.3211300373077393, + "learning_rate": 2.040068999468818e-06, + "loss": 0.0429, + "step": 912 + }, + { + "epoch": 5.601226993865031, + "grad_norm": 3.3712961673736572, + "learning_rate": 2.035333706202192e-06, + "loss": 0.0634, + "step": 913 + }, + { + "epoch": 5.6073619631901845, + "grad_norm": 2.480177402496338, + "learning_rate": 2.0306001390340565e-06, + "loss": 0.0178, + "step": 914 + }, + { + "epoch": 5.613496932515337, + "grad_norm": 2.9777421951293945, + "learning_rate": 2.02586831554822e-06, + "loss": 0.037, + "step": 915 + }, + { + "epoch": 5.61963190184049, + "grad_norm": 2.9129085540771484, + "learning_rate": 2.021138253322012e-06, + "loss": 0.125, + "step": 916 + }, + { + "epoch": 5.625766871165644, + "grad_norm": 4.041767597198486, + "learning_rate": 2.016409969926224e-06, + "loss": 0.1897, + "step": 917 + }, + { + "epoch": 5.631901840490798, + "grad_norm": 4.088902950286865, + "learning_rate": 2.0116834829250355e-06, + "loss": 0.0546, + "step": 918 + }, + { + "epoch": 5.638036809815951, + "grad_norm": 3.8629167079925537, + "learning_rate": 2.0069588098759545e-06, + "loss": 0.0911, + "step": 919 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 2.616830825805664, + "learning_rate": 2.00223596832975e-06, + "loss": 0.0527, + "step": 920 + }, + { + "epoch": 5.6503067484662575, + "grad_norm": 1.9370782375335693, + "learning_rate": 1.9975149758303885e-06, + "loss": 0.0384, + "step": 921 + }, + { + "epoch": 5.656441717791411, + "grad_norm": 3.7839455604553223, + "learning_rate": 1.992795849914967e-06, + "loss": 0.1033, + "step": 922 + }, + { + "epoch": 5.662576687116564, + "grad_norm": 3.870729923248291, + "learning_rate": 1.9880786081136498e-06, + "loss": 0.08, + "step": 923 + }, + { + "epoch": 5.668711656441718, + "grad_norm": 3.4394288063049316, + "learning_rate": 1.9833632679496008e-06, + "loss": 0.0819, + "step": 924 + }, + { + "epoch": 5.674846625766871, + "grad_norm": 3.1659159660339355, + "learning_rate": 1.97864984693892e-06, + "loss": 0.117, + "step": 925 + }, + { + "epoch": 5.680981595092025, + "grad_norm": 2.2375190258026123, + "learning_rate": 1.97393836259058e-06, + "loss": 0.0215, + "step": 926 + }, + { + "epoch": 5.6871165644171775, + "grad_norm": 3.9375314712524414, + "learning_rate": 1.969228832406358e-06, + "loss": 0.1422, + "step": 927 + }, + { + "epoch": 5.693251533742331, + "grad_norm": 3.1969058513641357, + "learning_rate": 1.964521273880772e-06, + "loss": 0.0538, + "step": 928 + }, + { + "epoch": 5.699386503067485, + "grad_norm": 3.5990066528320312, + "learning_rate": 1.9598157045010162e-06, + "loss": 0.114, + "step": 929 + }, + { + "epoch": 5.705521472392638, + "grad_norm": 3.1764235496520996, + "learning_rate": 1.9551121417468955e-06, + "loss": 0.053, + "step": 930 + }, + { + "epoch": 5.711656441717792, + "grad_norm": 4.1162309646606445, + "learning_rate": 1.9504106030907605e-06, + "loss": 0.0866, + "step": 931 + }, + { + "epoch": 5.717791411042945, + "grad_norm": 3.543071985244751, + "learning_rate": 1.945711105997444e-06, + "loss": 0.0908, + "step": 932 + }, + { + "epoch": 5.723926380368098, + "grad_norm": 4.136870384216309, + "learning_rate": 1.941013667924194e-06, + "loss": 0.0612, + "step": 933 + }, + { + "epoch": 5.730061349693251, + "grad_norm": 1.7658357620239258, + "learning_rate": 1.9363183063206097e-06, + "loss": 0.0283, + "step": 934 + }, + { + "epoch": 5.736196319018405, + "grad_norm": 3.9701411724090576, + "learning_rate": 1.931625038628577e-06, + "loss": 0.0948, + "step": 935 + }, + { + "epoch": 5.742331288343558, + "grad_norm": 3.0636157989501953, + "learning_rate": 1.9269338822822047e-06, + "loss": 0.0769, + "step": 936 + }, + { + "epoch": 5.748466257668712, + "grad_norm": 3.3671388626098633, + "learning_rate": 1.9222448547077573e-06, + "loss": 0.098, + "step": 937 + }, + { + "epoch": 5.754601226993865, + "grad_norm": 3.0725975036621094, + "learning_rate": 1.917557973323591e-06, + "loss": 0.0363, + "step": 938 + }, + { + "epoch": 5.7607361963190185, + "grad_norm": 2.5592041015625, + "learning_rate": 1.9128732555400915e-06, + "loss": 0.0205, + "step": 939 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 2.835740804672241, + "learning_rate": 1.9081907187596054e-06, + "loss": 0.0548, + "step": 940 + }, + { + "epoch": 5.773006134969325, + "grad_norm": 3.3596746921539307, + "learning_rate": 1.9035103803763793e-06, + "loss": 0.0454, + "step": 941 + }, + { + "epoch": 5.779141104294479, + "grad_norm": 3.226579427719116, + "learning_rate": 1.8988322577764918e-06, + "loss": 0.0514, + "step": 942 + }, + { + "epoch": 5.785276073619632, + "grad_norm": 3.2044687271118164, + "learning_rate": 1.8941563683377905e-06, + "loss": 0.1361, + "step": 943 + }, + { + "epoch": 5.791411042944786, + "grad_norm": 1.8300527334213257, + "learning_rate": 1.8894827294298296e-06, + "loss": 0.0139, + "step": 944 + }, + { + "epoch": 5.7975460122699385, + "grad_norm": 2.503735303878784, + "learning_rate": 1.884811358413801e-06, + "loss": 0.0311, + "step": 945 + }, + { + "epoch": 5.803680981595092, + "grad_norm": 2.171309471130371, + "learning_rate": 1.8801422726424735e-06, + "loss": 0.0227, + "step": 946 + }, + { + "epoch": 5.809815950920245, + "grad_norm": 1.8116636276245117, + "learning_rate": 1.8754754894601252e-06, + "loss": 0.0157, + "step": 947 + }, + { + "epoch": 5.815950920245399, + "grad_norm": 3.1412570476531982, + "learning_rate": 1.870811026202482e-06, + "loss": 0.1093, + "step": 948 + }, + { + "epoch": 5.822085889570552, + "grad_norm": 2.3962290287017822, + "learning_rate": 1.8661489001966526e-06, + "loss": 0.021, + "step": 949 + }, + { + "epoch": 5.828220858895706, + "grad_norm": 4.169166564941406, + "learning_rate": 1.8614891287610621e-06, + "loss": 0.0663, + "step": 950 + }, + { + "epoch": 5.8343558282208585, + "grad_norm": 3.1181528568267822, + "learning_rate": 1.8568317292053894e-06, + "loss": 0.1008, + "step": 951 + }, + { + "epoch": 5.840490797546012, + "grad_norm": 3.5155029296875, + "learning_rate": 1.8521767188305023e-06, + "loss": 0.0451, + "step": 952 + }, + { + "epoch": 5.846625766871165, + "grad_norm": 2.975693702697754, + "learning_rate": 1.8475241149283957e-06, + "loss": 0.0561, + "step": 953 + }, + { + "epoch": 5.852760736196319, + "grad_norm": 2.1581289768218994, + "learning_rate": 1.842873934782122e-06, + "loss": 0.0265, + "step": 954 + }, + { + "epoch": 5.858895705521473, + "grad_norm": 2.6281228065490723, + "learning_rate": 1.8382261956657318e-06, + "loss": 0.1196, + "step": 955 + }, + { + "epoch": 5.865030674846626, + "grad_norm": 2.9569528102874756, + "learning_rate": 1.8335809148442074e-06, + "loss": 0.1356, + "step": 956 + }, + { + "epoch": 5.871165644171779, + "grad_norm": 2.450949192047119, + "learning_rate": 1.8289381095734005e-06, + "loss": 0.0444, + "step": 957 + }, + { + "epoch": 5.877300613496932, + "grad_norm": 2.1737027168273926, + "learning_rate": 1.8242977970999643e-06, + "loss": 0.0622, + "step": 958 + }, + { + "epoch": 5.883435582822086, + "grad_norm": 3.350647211074829, + "learning_rate": 1.8196599946612956e-06, + "loss": 0.0762, + "step": 959 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 2.5031936168670654, + "learning_rate": 1.8150247194854642e-06, + "loss": 0.0207, + "step": 960 + }, + { + "epoch": 5.895705521472393, + "grad_norm": 3.7103707790374756, + "learning_rate": 1.8103919887911525e-06, + "loss": 0.1122, + "step": 961 + }, + { + "epoch": 5.901840490797546, + "grad_norm": 2.485322952270508, + "learning_rate": 1.8057618197875914e-06, + "loss": 0.0284, + "step": 962 + }, + { + "epoch": 5.9079754601226995, + "grad_norm": 1.903212547302246, + "learning_rate": 1.8011342296744961e-06, + "loss": 0.0239, + "step": 963 + }, + { + "epoch": 5.914110429447852, + "grad_norm": 3.015552520751953, + "learning_rate": 1.796509235642001e-06, + "loss": 0.0425, + "step": 964 + }, + { + "epoch": 5.920245398773006, + "grad_norm": 4.806198596954346, + "learning_rate": 1.7918868548705982e-06, + "loss": 0.2094, + "step": 965 + }, + { + "epoch": 5.92638036809816, + "grad_norm": 2.949596643447876, + "learning_rate": 1.7872671045310703e-06, + "loss": 0.0632, + "step": 966 + }, + { + "epoch": 5.932515337423313, + "grad_norm": 4.153099536895752, + "learning_rate": 1.782650001784431e-06, + "loss": 0.1411, + "step": 967 + }, + { + "epoch": 5.938650306748467, + "grad_norm": 3.4117565155029297, + "learning_rate": 1.7780355637818568e-06, + "loss": 0.0965, + "step": 968 + }, + { + "epoch": 5.9447852760736195, + "grad_norm": 2.533405303955078, + "learning_rate": 1.7734238076646277e-06, + "loss": 0.0568, + "step": 969 + }, + { + "epoch": 5.950920245398773, + "grad_norm": 2.3604726791381836, + "learning_rate": 1.7688147505640581e-06, + "loss": 0.0182, + "step": 970 + }, + { + "epoch": 5.957055214723926, + "grad_norm": 3.807424306869507, + "learning_rate": 1.7642084096014405e-06, + "loss": 0.0547, + "step": 971 + }, + { + "epoch": 5.96319018404908, + "grad_norm": 2.5735342502593994, + "learning_rate": 1.759604801887974e-06, + "loss": 0.0775, + "step": 972 + }, + { + "epoch": 5.969325153374233, + "grad_norm": 2.9217734336853027, + "learning_rate": 1.7550039445247069e-06, + "loss": 0.0541, + "step": 973 + }, + { + "epoch": 5.975460122699387, + "grad_norm": 2.793104410171509, + "learning_rate": 1.7504058546024694e-06, + "loss": 0.0257, + "step": 974 + }, + { + "epoch": 5.9815950920245395, + "grad_norm": 3.5610134601593018, + "learning_rate": 1.7458105492018114e-06, + "loss": 0.0767, + "step": 975 + }, + { + "epoch": 5.987730061349693, + "grad_norm": 2.0738015174865723, + "learning_rate": 1.7412180453929412e-06, + "loss": 0.025, + "step": 976 + }, + { + "epoch": 5.993865030674847, + "grad_norm": 2.1248421669006348, + "learning_rate": 1.736628360235657e-06, + "loss": 0.0183, + "step": 977 + }, + { + "epoch": 6.0, + "grad_norm": 2.901273727416992, + "learning_rate": 1.7320415107792893e-06, + "loss": 0.1369, + "step": 978 + }, + { + "epoch": 6.006134969325154, + "grad_norm": 3.815110683441162, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.1011, + "step": 979 + }, + { + "epoch": 6.012269938650307, + "grad_norm": 2.421208381652832, + "learning_rate": 1.7228763871138845e-06, + "loss": 0.0105, + "step": 980 + }, + { + "epoch": 6.0184049079754605, + "grad_norm": 2.7103846073150635, + "learning_rate": 1.718298146950585e-06, + "loss": 0.0373, + "step": 981 + }, + { + "epoch": 6.024539877300613, + "grad_norm": 1.3751411437988281, + "learning_rate": 1.7137228105795473e-06, + "loss": 0.0072, + "step": 982 + }, + { + "epoch": 6.030674846625767, + "grad_norm": 1.5235071182250977, + "learning_rate": 1.7091503949967987e-06, + "loss": 0.0126, + "step": 983 + }, + { + "epoch": 6.03680981595092, + "grad_norm": 2.0652546882629395, + "learning_rate": 1.7045809171875183e-06, + "loss": 0.0198, + "step": 984 + }, + { + "epoch": 6.042944785276074, + "grad_norm": 2.010207176208496, + "learning_rate": 1.70001439412597e-06, + "loss": 0.0186, + "step": 985 + }, + { + "epoch": 6.049079754601227, + "grad_norm": 2.0444021224975586, + "learning_rate": 1.6954508427754435e-06, + "loss": 0.0197, + "step": 986 + }, + { + "epoch": 6.0552147239263805, + "grad_norm": 2.6540091037750244, + "learning_rate": 1.690890280088187e-06, + "loss": 0.0192, + "step": 987 + }, + { + "epoch": 6.061349693251533, + "grad_norm": 1.6479653120040894, + "learning_rate": 1.6863327230053506e-06, + "loss": 0.0105, + "step": 988 + }, + { + "epoch": 6.067484662576687, + "grad_norm": 2.4434754848480225, + "learning_rate": 1.6817781884569146e-06, + "loss": 0.0275, + "step": 989 + }, + { + "epoch": 6.07361963190184, + "grad_norm": 1.7472137212753296, + "learning_rate": 1.677226693361636e-06, + "loss": 0.0095, + "step": 990 + }, + { + "epoch": 6.079754601226994, + "grad_norm": 2.952821969985962, + "learning_rate": 1.6726782546269793e-06, + "loss": 0.0483, + "step": 991 + }, + { + "epoch": 6.085889570552148, + "grad_norm": 3.123959541320801, + "learning_rate": 1.6681328891490544e-06, + "loss": 0.0815, + "step": 992 + }, + { + "epoch": 6.0920245398773005, + "grad_norm": 2.9924800395965576, + "learning_rate": 1.663590613812556e-06, + "loss": 0.0216, + "step": 993 + }, + { + "epoch": 6.098159509202454, + "grad_norm": 2.417778730392456, + "learning_rate": 1.6590514454907007e-06, + "loss": 0.0243, + "step": 994 + }, + { + "epoch": 6.104294478527607, + "grad_norm": 2.0682942867279053, + "learning_rate": 1.6545154010451613e-06, + "loss": 0.0669, + "step": 995 + }, + { + "epoch": 6.110429447852761, + "grad_norm": 2.9801135063171387, + "learning_rate": 1.6499824973260086e-06, + "loss": 0.0309, + "step": 996 + }, + { + "epoch": 6.116564417177914, + "grad_norm": 1.5753487348556519, + "learning_rate": 1.645452751171645e-06, + "loss": 0.026, + "step": 997 + }, + { + "epoch": 6.122699386503068, + "grad_norm": 2.461124897003174, + "learning_rate": 1.6409261794087438e-06, + "loss": 0.0191, + "step": 998 + }, + { + "epoch": 6.128834355828221, + "grad_norm": 3.839308261871338, + "learning_rate": 1.6364027988521875e-06, + "loss": 0.045, + "step": 999 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 2.9653189182281494, + "learning_rate": 1.6318826263050022e-06, + "loss": 0.0197, + "step": 1000 + }, + { + "epoch": 6.141104294478527, + "grad_norm": 1.1804074048995972, + "learning_rate": 1.6273656785582986e-06, + "loss": 0.0092, + "step": 1001 + }, + { + "epoch": 6.147239263803681, + "grad_norm": 1.9027175903320312, + "learning_rate": 1.6228519723912073e-06, + "loss": 0.0141, + "step": 1002 + }, + { + "epoch": 6.153374233128835, + "grad_norm": 1.831039309501648, + "learning_rate": 1.618341524570819e-06, + "loss": 0.0131, + "step": 1003 + }, + { + "epoch": 6.159509202453988, + "grad_norm": 2.547327756881714, + "learning_rate": 1.613834351852119e-06, + "loss": 0.0686, + "step": 1004 + }, + { + "epoch": 6.1656441717791415, + "grad_norm": 2.746947765350342, + "learning_rate": 1.6093304709779273e-06, + "loss": 0.036, + "step": 1005 + }, + { + "epoch": 6.171779141104294, + "grad_norm": 2.0104732513427734, + "learning_rate": 1.6048298986788345e-06, + "loss": 0.0216, + "step": 1006 + }, + { + "epoch": 6.177914110429448, + "grad_norm": 2.655977725982666, + "learning_rate": 1.6003326516731431e-06, + "loss": 0.024, + "step": 1007 + }, + { + "epoch": 6.184049079754601, + "grad_norm": 2.0733132362365723, + "learning_rate": 1.5958387466668015e-06, + "loss": 0.0133, + "step": 1008 + }, + { + "epoch": 6.190184049079755, + "grad_norm": 2.5398054122924805, + "learning_rate": 1.5913482003533437e-06, + "loss": 0.0331, + "step": 1009 + }, + { + "epoch": 6.196319018404908, + "grad_norm": 1.7983721494674683, + "learning_rate": 1.5868610294138264e-06, + "loss": 0.0111, + "step": 1010 + }, + { + "epoch": 6.2024539877300615, + "grad_norm": 1.7259647846221924, + "learning_rate": 1.58237725051677e-06, + "loss": 0.0112, + "step": 1011 + }, + { + "epoch": 6.208588957055214, + "grad_norm": 1.7722725868225098, + "learning_rate": 1.577896880318093e-06, + "loss": 0.0181, + "step": 1012 + }, + { + "epoch": 6.214723926380368, + "grad_norm": 3.633545398712158, + "learning_rate": 1.5734199354610513e-06, + "loss": 0.0135, + "step": 1013 + }, + { + "epoch": 6.220858895705521, + "grad_norm": 1.8951494693756104, + "learning_rate": 1.5689464325761764e-06, + "loss": 0.0163, + "step": 1014 + }, + { + "epoch": 6.226993865030675, + "grad_norm": 1.637170433998108, + "learning_rate": 1.564476388281216e-06, + "loss": 0.0068, + "step": 1015 + }, + { + "epoch": 6.233128834355828, + "grad_norm": 2.2963850498199463, + "learning_rate": 1.5600098191810682e-06, + "loss": 0.021, + "step": 1016 + }, + { + "epoch": 6.2392638036809815, + "grad_norm": 2.777996063232422, + "learning_rate": 1.555546741867722e-06, + "loss": 0.0349, + "step": 1017 + }, + { + "epoch": 6.245398773006135, + "grad_norm": 2.1580724716186523, + "learning_rate": 1.5510871729201953e-06, + "loss": 0.0626, + "step": 1018 + }, + { + "epoch": 6.251533742331288, + "grad_norm": 1.4158363342285156, + "learning_rate": 1.5466311289044755e-06, + "loss": 0.0082, + "step": 1019 + }, + { + "epoch": 6.257668711656442, + "grad_norm": 3.287564516067505, + "learning_rate": 1.5421786263734524e-06, + "loss": 0.0212, + "step": 1020 + }, + { + "epoch": 6.263803680981595, + "grad_norm": 2.4552016258239746, + "learning_rate": 1.5377296818668638e-06, + "loss": 0.0963, + "step": 1021 + }, + { + "epoch": 6.269938650306749, + "grad_norm": 1.877556562423706, + "learning_rate": 1.5332843119112285e-06, + "loss": 0.011, + "step": 1022 + }, + { + "epoch": 6.276073619631902, + "grad_norm": 3.720372438430786, + "learning_rate": 1.5288425330197864e-06, + "loss": 0.018, + "step": 1023 + }, + { + "epoch": 6.282208588957055, + "grad_norm": 1.9751925468444824, + "learning_rate": 1.5244043616924389e-06, + "loss": 0.0162, + "step": 1024 + }, + { + "epoch": 6.288343558282208, + "grad_norm": 2.5137453079223633, + "learning_rate": 1.5199698144156865e-06, + "loss": 0.0468, + "step": 1025 + }, + { + "epoch": 6.294478527607362, + "grad_norm": 2.111983299255371, + "learning_rate": 1.5155389076625663e-06, + "loss": 0.0064, + "step": 1026 + }, + { + "epoch": 6.300613496932515, + "grad_norm": 2.572223663330078, + "learning_rate": 1.5111116578925924e-06, + "loss": 0.035, + "step": 1027 + }, + { + "epoch": 6.306748466257669, + "grad_norm": 2.7881019115448, + "learning_rate": 1.5066880815516943e-06, + "loss": 0.0197, + "step": 1028 + }, + { + "epoch": 6.3128834355828225, + "grad_norm": 1.2287017107009888, + "learning_rate": 1.5022681950721565e-06, + "loss": 0.0059, + "step": 1029 + }, + { + "epoch": 6.319018404907975, + "grad_norm": 1.764028549194336, + "learning_rate": 1.4978520148725558e-06, + "loss": 0.006, + "step": 1030 + }, + { + "epoch": 6.325153374233129, + "grad_norm": 2.399787664413452, + "learning_rate": 1.4934395573577016e-06, + "loss": 0.0126, + "step": 1031 + }, + { + "epoch": 6.331288343558282, + "grad_norm": 1.9056172370910645, + "learning_rate": 1.4890308389185743e-06, + "loss": 0.0131, + "step": 1032 + }, + { + "epoch": 6.337423312883436, + "grad_norm": 1.7394744157791138, + "learning_rate": 1.484625875932265e-06, + "loss": 0.016, + "step": 1033 + }, + { + "epoch": 6.343558282208589, + "grad_norm": 4.352719306945801, + "learning_rate": 1.480224684761915e-06, + "loss": 0.1059, + "step": 1034 + }, + { + "epoch": 6.3496932515337425, + "grad_norm": 2.148385524749756, + "learning_rate": 1.4758272817566538e-06, + "loss": 0.0312, + "step": 1035 + }, + { + "epoch": 6.355828220858895, + "grad_norm": 2.483872175216675, + "learning_rate": 1.4714336832515386e-06, + "loss": 0.0215, + "step": 1036 + }, + { + "epoch": 6.361963190184049, + "grad_norm": 2.6151270866394043, + "learning_rate": 1.467043905567494e-06, + "loss": 0.0718, + "step": 1037 + }, + { + "epoch": 6.368098159509202, + "grad_norm": 2.554600954055786, + "learning_rate": 1.4626579650112533e-06, + "loss": 0.0166, + "step": 1038 + }, + { + "epoch": 6.374233128834356, + "grad_norm": 3.013974905014038, + "learning_rate": 1.4582758778752926e-06, + "loss": 0.0448, + "step": 1039 + }, + { + "epoch": 6.38036809815951, + "grad_norm": 2.1542789936065674, + "learning_rate": 1.4538976604377781e-06, + "loss": 0.0297, + "step": 1040 + }, + { + "epoch": 6.386503067484663, + "grad_norm": 3.4402377605438232, + "learning_rate": 1.449523328962496e-06, + "loss": 0.0409, + "step": 1041 + }, + { + "epoch": 6.392638036809816, + "grad_norm": 1.6200538873672485, + "learning_rate": 1.4451528996988018e-06, + "loss": 0.0127, + "step": 1042 + }, + { + "epoch": 6.398773006134969, + "grad_norm": 3.081733465194702, + "learning_rate": 1.4407863888815527e-06, + "loss": 0.0788, + "step": 1043 + }, + { + "epoch": 6.404907975460123, + "grad_norm": 1.9813143014907837, + "learning_rate": 1.436423812731051e-06, + "loss": 0.0082, + "step": 1044 + }, + { + "epoch": 6.411042944785276, + "grad_norm": 1.7354048490524292, + "learning_rate": 1.432065187452984e-06, + "loss": 0.0086, + "step": 1045 + }, + { + "epoch": 6.41717791411043, + "grad_norm": 1.8812576532363892, + "learning_rate": 1.4277105292383594e-06, + "loss": 0.04, + "step": 1046 + }, + { + "epoch": 6.423312883435583, + "grad_norm": 1.117837905883789, + "learning_rate": 1.4233598542634519e-06, + "loss": 0.0054, + "step": 1047 + }, + { + "epoch": 6.429447852760736, + "grad_norm": 1.9587867259979248, + "learning_rate": 1.4190131786897388e-06, + "loss": 0.0263, + "step": 1048 + }, + { + "epoch": 6.435582822085889, + "grad_norm": 1.2712376117706299, + "learning_rate": 1.4146705186638388e-06, + "loss": 0.0098, + "step": 1049 + }, + { + "epoch": 6.441717791411043, + "grad_norm": 2.6563849449157715, + "learning_rate": 1.410331890317457e-06, + "loss": 0.0322, + "step": 1050 + }, + { + "epoch": 6.447852760736196, + "grad_norm": 3.136518955230713, + "learning_rate": 1.4059973097673187e-06, + "loss": 0.0729, + "step": 1051 + }, + { + "epoch": 6.45398773006135, + "grad_norm": 1.3937572240829468, + "learning_rate": 1.4016667931151156e-06, + "loss": 0.0094, + "step": 1052 + }, + { + "epoch": 6.460122699386503, + "grad_norm": 1.7218928337097168, + "learning_rate": 1.3973403564474422e-06, + "loss": 0.0078, + "step": 1053 + }, + { + "epoch": 6.466257668711656, + "grad_norm": 2.35612416267395, + "learning_rate": 1.393018015835737e-06, + "loss": 0.0231, + "step": 1054 + }, + { + "epoch": 6.47239263803681, + "grad_norm": 1.96125066280365, + "learning_rate": 1.388699787336224e-06, + "loss": 0.0153, + "step": 1055 + }, + { + "epoch": 6.478527607361963, + "grad_norm": 2.1789233684539795, + "learning_rate": 1.3843856869898486e-06, + "loss": 0.0136, + "step": 1056 + }, + { + "epoch": 6.484662576687117, + "grad_norm": 3.1261701583862305, + "learning_rate": 1.3800757308222263e-06, + "loss": 0.0819, + "step": 1057 + }, + { + "epoch": 6.49079754601227, + "grad_norm": 2.93422794342041, + "learning_rate": 1.3757699348435726e-06, + "loss": 0.0658, + "step": 1058 + }, + { + "epoch": 6.4969325153374236, + "grad_norm": 2.1311776638031006, + "learning_rate": 1.3714683150486534e-06, + "loss": 0.0106, + "step": 1059 + }, + { + "epoch": 6.5030674846625764, + "grad_norm": 1.699877381324768, + "learning_rate": 1.3671708874167211e-06, + "loss": 0.0151, + "step": 1060 + }, + { + "epoch": 6.50920245398773, + "grad_norm": 1.7288825511932373, + "learning_rate": 1.3628776679114516e-06, + "loss": 0.0114, + "step": 1061 + }, + { + "epoch": 6.515337423312883, + "grad_norm": 1.8437966108322144, + "learning_rate": 1.3585886724808934e-06, + "loss": 0.0117, + "step": 1062 + }, + { + "epoch": 6.521472392638037, + "grad_norm": 3.073568344116211, + "learning_rate": 1.3543039170574022e-06, + "loss": 0.0381, + "step": 1063 + }, + { + "epoch": 6.52760736196319, + "grad_norm": 1.6069157123565674, + "learning_rate": 1.350023417557581e-06, + "loss": 0.0072, + "step": 1064 + }, + { + "epoch": 6.533742331288344, + "grad_norm": 2.48502779006958, + "learning_rate": 1.345747189882228e-06, + "loss": 0.0302, + "step": 1065 + }, + { + "epoch": 6.539877300613497, + "grad_norm": 1.6879143714904785, + "learning_rate": 1.3414752499162676e-06, + "loss": 0.0095, + "step": 1066 + }, + { + "epoch": 6.54601226993865, + "grad_norm": 2.2126848697662354, + "learning_rate": 1.3372076135287005e-06, + "loss": 0.067, + "step": 1067 + }, + { + "epoch": 6.552147239263804, + "grad_norm": 2.157269239425659, + "learning_rate": 1.33294429657254e-06, + "loss": 0.0203, + "step": 1068 + }, + { + "epoch": 6.558282208588957, + "grad_norm": 2.725158452987671, + "learning_rate": 1.3286853148847523e-06, + "loss": 0.0217, + "step": 1069 + }, + { + "epoch": 6.564417177914111, + "grad_norm": 2.478426456451416, + "learning_rate": 1.3244306842862007e-06, + "loss": 0.0223, + "step": 1070 + }, + { + "epoch": 6.570552147239264, + "grad_norm": 2.349463939666748, + "learning_rate": 1.3201804205815872e-06, + "loss": 0.027, + "step": 1071 + }, + { + "epoch": 6.576687116564417, + "grad_norm": 2.049593210220337, + "learning_rate": 1.3159345395593876e-06, + "loss": 0.0212, + "step": 1072 + }, + { + "epoch": 6.58282208588957, + "grad_norm": 2.3445141315460205, + "learning_rate": 1.3116930569918024e-06, + "loss": 0.0182, + "step": 1073 + }, + { + "epoch": 6.588957055214724, + "grad_norm": 3.756135940551758, + "learning_rate": 1.3074559886346886e-06, + "loss": 0.1187, + "step": 1074 + }, + { + "epoch": 6.595092024539877, + "grad_norm": 2.4747114181518555, + "learning_rate": 1.3032233502275089e-06, + "loss": 0.0103, + "step": 1075 + }, + { + "epoch": 6.601226993865031, + "grad_norm": 2.0029311180114746, + "learning_rate": 1.2989951574932693e-06, + "loss": 0.0115, + "step": 1076 + }, + { + "epoch": 6.6073619631901845, + "grad_norm": 2.007141351699829, + "learning_rate": 1.2947714261384602e-06, + "loss": 0.0155, + "step": 1077 + }, + { + "epoch": 6.613496932515337, + "grad_norm": 1.5075048208236694, + "learning_rate": 1.2905521718530012e-06, + "loss": 0.0125, + "step": 1078 + }, + { + "epoch": 6.61963190184049, + "grad_norm": 1.9235132932662964, + "learning_rate": 1.2863374103101784e-06, + "loss": 0.0181, + "step": 1079 + }, + { + "epoch": 6.625766871165644, + "grad_norm": 1.7235040664672852, + "learning_rate": 1.2821271571665912e-06, + "loss": 0.0102, + "step": 1080 + }, + { + "epoch": 6.631901840490798, + "grad_norm": 3.503974676132202, + "learning_rate": 1.277921428062091e-06, + "loss": 0.0969, + "step": 1081 + }, + { + "epoch": 6.638036809815951, + "grad_norm": 2.4633288383483887, + "learning_rate": 1.2737202386197222e-06, + "loss": 0.0383, + "step": 1082 + }, + { + "epoch": 6.644171779141105, + "grad_norm": 2.332341432571411, + "learning_rate": 1.2695236044456672e-06, + "loss": 0.0184, + "step": 1083 + }, + { + "epoch": 6.6503067484662575, + "grad_norm": 2.8279805183410645, + "learning_rate": 1.2653315411291867e-06, + "loss": 0.0327, + "step": 1084 + }, + { + "epoch": 6.656441717791411, + "grad_norm": 2.444810628890991, + "learning_rate": 1.2611440642425617e-06, + "loss": 0.0399, + "step": 1085 + }, + { + "epoch": 6.662576687116564, + "grad_norm": 2.9304957389831543, + "learning_rate": 1.2569611893410374e-06, + "loss": 0.0385, + "step": 1086 + }, + { + "epoch": 6.668711656441718, + "grad_norm": 2.1244678497314453, + "learning_rate": 1.2527829319627604e-06, + "loss": 0.0123, + "step": 1087 + }, + { + "epoch": 6.674846625766871, + "grad_norm": 2.129033327102661, + "learning_rate": 1.248609307628729e-06, + "loss": 0.0302, + "step": 1088 + }, + { + "epoch": 6.680981595092025, + "grad_norm": 5.788925647735596, + "learning_rate": 1.2444403318427268e-06, + "loss": 0.0296, + "step": 1089 + }, + { + "epoch": 6.6871165644171775, + "grad_norm": 5.127935886383057, + "learning_rate": 1.2402760200912725e-06, + "loss": 0.1532, + "step": 1090 + }, + { + "epoch": 6.693251533742331, + "grad_norm": 2.2610318660736084, + "learning_rate": 1.2361163878435594e-06, + "loss": 0.0126, + "step": 1091 + }, + { + "epoch": 6.699386503067485, + "grad_norm": 1.7913328409194946, + "learning_rate": 1.2319614505513953e-06, + "loss": 0.0086, + "step": 1092 + }, + { + "epoch": 6.705521472392638, + "grad_norm": 1.5961267948150635, + "learning_rate": 1.227811223649149e-06, + "loss": 0.0041, + "step": 1093 + }, + { + "epoch": 6.711656441717792, + "grad_norm": 1.441754937171936, + "learning_rate": 1.2236657225536938e-06, + "loss": 0.0103, + "step": 1094 + }, + { + "epoch": 6.717791411042945, + "grad_norm": 1.4393174648284912, + "learning_rate": 1.2195249626643432e-06, + "loss": 0.0063, + "step": 1095 + }, + { + "epoch": 6.723926380368098, + "grad_norm": 3.199451208114624, + "learning_rate": 1.2153889593628032e-06, + "loss": 0.0571, + "step": 1096 + }, + { + "epoch": 6.730061349693251, + "grad_norm": 2.1796770095825195, + "learning_rate": 1.211257728013107e-06, + "loss": 0.0269, + "step": 1097 + }, + { + "epoch": 6.736196319018405, + "grad_norm": 3.1798806190490723, + "learning_rate": 1.2071312839615634e-06, + "loss": 0.0396, + "step": 1098 + }, + { + "epoch": 6.742331288343558, + "grad_norm": 3.063633680343628, + "learning_rate": 1.2030096425366985e-06, + "loss": 0.0261, + "step": 1099 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 1.860409140586853, + "learning_rate": 1.1988928190491948e-06, + "loss": 0.013, + "step": 1100 + }, + { + "epoch": 6.754601226993865, + "grad_norm": 1.9303224086761475, + "learning_rate": 1.1947808287918406e-06, + "loss": 0.0113, + "step": 1101 + }, + { + "epoch": 6.7607361963190185, + "grad_norm": 2.1432337760925293, + "learning_rate": 1.19067368703947e-06, + "loss": 0.0195, + "step": 1102 + }, + { + "epoch": 6.766871165644172, + "grad_norm": 1.8998470306396484, + "learning_rate": 1.1865714090489038e-06, + "loss": 0.0105, + "step": 1103 + }, + { + "epoch": 6.773006134969325, + "grad_norm": 2.3260247707366943, + "learning_rate": 1.1824740100588991e-06, + "loss": 0.0554, + "step": 1104 + }, + { + "epoch": 6.779141104294479, + "grad_norm": 1.9272006750106812, + "learning_rate": 1.1783815052900848e-06, + "loss": 0.0118, + "step": 1105 + }, + { + "epoch": 6.785276073619632, + "grad_norm": 3.1646785736083984, + "learning_rate": 1.1742939099449126e-06, + "loss": 0.0901, + "step": 1106 + }, + { + "epoch": 6.791411042944786, + "grad_norm": 3.357422351837158, + "learning_rate": 1.1702112392075966e-06, + "loss": 0.0833, + "step": 1107 + }, + { + "epoch": 6.7975460122699385, + "grad_norm": 1.4302526712417603, + "learning_rate": 1.1661335082440545e-06, + "loss": 0.0078, + "step": 1108 + }, + { + "epoch": 6.803680981595092, + "grad_norm": 1.3046417236328125, + "learning_rate": 1.1620607322018587e-06, + "loss": 0.0092, + "step": 1109 + }, + { + "epoch": 6.809815950920245, + "grad_norm": 2.084237813949585, + "learning_rate": 1.1579929262101712e-06, + "loss": 0.0283, + "step": 1110 + }, + { + "epoch": 6.815950920245399, + "grad_norm": 1.9403250217437744, + "learning_rate": 1.153930105379695e-06, + "loss": 0.0066, + "step": 1111 + }, + { + "epoch": 6.822085889570552, + "grad_norm": 2.282449722290039, + "learning_rate": 1.1498722848026142e-06, + "loss": 0.0402, + "step": 1112 + }, + { + "epoch": 6.828220858895706, + "grad_norm": 1.9357627630233765, + "learning_rate": 1.1458194795525354e-06, + "loss": 0.0101, + "step": 1113 + }, + { + "epoch": 6.8343558282208585, + "grad_norm": 2.0236339569091797, + "learning_rate": 1.1417717046844385e-06, + "loss": 0.0109, + "step": 1114 + }, + { + "epoch": 6.840490797546012, + "grad_norm": 2.386857032775879, + "learning_rate": 1.137728975234615e-06, + "loss": 0.0297, + "step": 1115 + }, + { + "epoch": 6.846625766871165, + "grad_norm": 2.2477970123291016, + "learning_rate": 1.1336913062206157e-06, + "loss": 0.0393, + "step": 1116 + }, + { + "epoch": 6.852760736196319, + "grad_norm": 2.7217776775360107, + "learning_rate": 1.129658712641192e-06, + "loss": 0.0269, + "step": 1117 + }, + { + "epoch": 6.858895705521473, + "grad_norm": 2.6717259883880615, + "learning_rate": 1.125631209476241e-06, + "loss": 0.0708, + "step": 1118 + }, + { + "epoch": 6.865030674846626, + "grad_norm": 2.951939344406128, + "learning_rate": 1.1216088116867524e-06, + "loss": 0.0835, + "step": 1119 + }, + { + "epoch": 6.871165644171779, + "grad_norm": 1.9705166816711426, + "learning_rate": 1.1175915342147486e-06, + "loss": 0.0107, + "step": 1120 + }, + { + "epoch": 6.877300613496932, + "grad_norm": 2.4005937576293945, + "learning_rate": 1.1135793919832336e-06, + "loss": 0.0139, + "step": 1121 + }, + { + "epoch": 6.883435582822086, + "grad_norm": 2.277463674545288, + "learning_rate": 1.1095723998961353e-06, + "loss": 0.0154, + "step": 1122 + }, + { + "epoch": 6.889570552147239, + "grad_norm": 1.5026034116744995, + "learning_rate": 1.1055705728382482e-06, + "loss": 0.0072, + "step": 1123 + }, + { + "epoch": 6.895705521472393, + "grad_norm": 1.9540379047393799, + "learning_rate": 1.1015739256751826e-06, + "loss": 0.0202, + "step": 1124 + }, + { + "epoch": 6.901840490797546, + "grad_norm": 2.3090603351593018, + "learning_rate": 1.0975824732533066e-06, + "loss": 0.0559, + "step": 1125 + }, + { + "epoch": 6.9079754601226995, + "grad_norm": 2.100283622741699, + "learning_rate": 1.09359623039969e-06, + "loss": 0.0385, + "step": 1126 + }, + { + "epoch": 6.914110429447852, + "grad_norm": 2.4120566844940186, + "learning_rate": 1.0896152119220525e-06, + "loss": 0.0535, + "step": 1127 + }, + { + "epoch": 6.920245398773006, + "grad_norm": 2.003495454788208, + "learning_rate": 1.0856394326087045e-06, + "loss": 0.0104, + "step": 1128 + }, + { + "epoch": 6.92638036809816, + "grad_norm": 1.6565535068511963, + "learning_rate": 1.0816689072284962e-06, + "loss": 0.0121, + "step": 1129 + }, + { + "epoch": 6.932515337423313, + "grad_norm": 1.6503472328186035, + "learning_rate": 1.0777036505307616e-06, + "loss": 0.0056, + "step": 1130 + }, + { + "epoch": 6.938650306748467, + "grad_norm": 2.600112199783325, + "learning_rate": 1.0737436772452602e-06, + "loss": 0.0198, + "step": 1131 + }, + { + "epoch": 6.9447852760736195, + "grad_norm": 1.6668883562088013, + "learning_rate": 1.0697890020821292e-06, + "loss": 0.0077, + "step": 1132 + }, + { + "epoch": 6.950920245398773, + "grad_norm": 2.729172706604004, + "learning_rate": 1.0658396397318203e-06, + "loss": 0.0329, + "step": 1133 + }, + { + "epoch": 6.957055214723926, + "grad_norm": 1.5219136476516724, + "learning_rate": 1.061895604865053e-06, + "loss": 0.0113, + "step": 1134 + }, + { + "epoch": 6.96319018404908, + "grad_norm": 3.8395588397979736, + "learning_rate": 1.057956912132757e-06, + "loss": 0.0376, + "step": 1135 + }, + { + "epoch": 6.969325153374233, + "grad_norm": 2.4347221851348877, + "learning_rate": 1.054023576166014e-06, + "loss": 0.0517, + "step": 1136 + }, + { + "epoch": 6.975460122699387, + "grad_norm": 3.079165458679199, + "learning_rate": 1.0500956115760105e-06, + "loss": 0.0373, + "step": 1137 + }, + { + "epoch": 6.9815950920245395, + "grad_norm": 1.9391908645629883, + "learning_rate": 1.0461730329539794e-06, + "loss": 0.019, + "step": 1138 + }, + { + "epoch": 6.987730061349693, + "grad_norm": 1.8693119287490845, + "learning_rate": 1.0422558548711434e-06, + "loss": 0.0073, + "step": 1139 + }, + { + "epoch": 6.993865030674847, + "grad_norm": 3.0920307636260986, + "learning_rate": 1.0383440918786684e-06, + "loss": 0.0099, + "step": 1140 + }, + { + "epoch": 7.0, + "grad_norm": 3.184906244277954, + "learning_rate": 1.0344377585076e-06, + "loss": 0.0218, + "step": 1141 + }, + { + "epoch": 7.006134969325154, + "grad_norm": 0.7609673142433167, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.0024, + "step": 1142 + }, + { + "epoch": 7.012269938650307, + "grad_norm": 1.1493247747421265, + "learning_rate": 1.0266414386529775e-06, + "loss": 0.0059, + "step": 1143 + }, + { + "epoch": 7.0184049079754605, + "grad_norm": 3.534796953201294, + "learning_rate": 1.0227514811304556e-06, + "loss": 0.0843, + "step": 1144 + }, + { + "epoch": 7.024539877300613, + "grad_norm": 1.1876507997512817, + "learning_rate": 1.0188670111513002e-06, + "loss": 0.0098, + "step": 1145 + }, + { + "epoch": 7.030674846625767, + "grad_norm": 1.2825753688812256, + "learning_rate": 1.0149880431451736e-06, + "loss": 0.0042, + "step": 1146 + }, + { + "epoch": 7.03680981595092, + "grad_norm": 0.6842563152313232, + "learning_rate": 1.0111145915213e-06, + "loss": 0.003, + "step": 1147 + }, + { + "epoch": 7.042944785276074, + "grad_norm": 0.6310113072395325, + "learning_rate": 1.0072466706684127e-06, + "loss": 0.0027, + "step": 1148 + }, + { + "epoch": 7.049079754601227, + "grad_norm": 1.484761357307434, + "learning_rate": 1.0033842949546974e-06, + "loss": 0.0105, + "step": 1149 + }, + { + "epoch": 7.0552147239263805, + "grad_norm": 1.9790291786193848, + "learning_rate": 9.995274787277445e-07, + "loss": 0.0233, + "step": 1150 + }, + { + "epoch": 7.061349693251533, + "grad_norm": 1.1398522853851318, + "learning_rate": 9.956762363144892e-07, + "loss": 0.0031, + "step": 1151 + }, + { + "epoch": 7.067484662576687, + "grad_norm": 1.0574359893798828, + "learning_rate": 9.918305820211643e-07, + "loss": 0.0047, + "step": 1152 + }, + { + "epoch": 7.07361963190184, + "grad_norm": 2.463972330093384, + "learning_rate": 9.879905301332439e-07, + "loss": 0.0334, + "step": 1153 + }, + { + "epoch": 7.079754601226994, + "grad_norm": 1.4698575735092163, + "learning_rate": 9.84156094915389e-07, + "loss": 0.0191, + "step": 1154 + }, + { + "epoch": 7.085889570552148, + "grad_norm": 1.2635239362716675, + "learning_rate": 9.803272906113978e-07, + "loss": 0.0045, + "step": 1155 + }, + { + "epoch": 7.0920245398773005, + "grad_norm": 1.7271842956542969, + "learning_rate": 9.765041314441529e-07, + "loss": 0.0042, + "step": 1156 + }, + { + "epoch": 7.098159509202454, + "grad_norm": 1.5738918781280518, + "learning_rate": 9.72686631615563e-07, + "loss": 0.0066, + "step": 1157 + }, + { + "epoch": 7.104294478527607, + "grad_norm": 1.3097981214523315, + "learning_rate": 9.688748053065179e-07, + "loss": 0.0058, + "step": 1158 + }, + { + "epoch": 7.110429447852761, + "grad_norm": 2.076064348220825, + "learning_rate": 9.65068666676828e-07, + "loss": 0.0067, + "step": 1159 + }, + { + "epoch": 7.116564417177914, + "grad_norm": 1.1589064598083496, + "learning_rate": 9.612682298651792e-07, + "loss": 0.0052, + "step": 1160 + }, + { + "epoch": 7.122699386503068, + "grad_norm": 1.6450324058532715, + "learning_rate": 9.574735089890765e-07, + "loss": 0.0035, + "step": 1161 + }, + { + "epoch": 7.128834355828221, + "grad_norm": 1.6968387365341187, + "learning_rate": 9.53684518144789e-07, + "loss": 0.0126, + "step": 1162 + }, + { + "epoch": 7.134969325153374, + "grad_norm": 1.9047832489013672, + "learning_rate": 9.499012714073036e-07, + "loss": 0.0345, + "step": 1163 + }, + { + "epoch": 7.141104294478527, + "grad_norm": 1.7587796449661255, + "learning_rate": 9.461237828302666e-07, + "loss": 0.0144, + "step": 1164 + }, + { + "epoch": 7.147239263803681, + "grad_norm": 1.863775372505188, + "learning_rate": 9.423520664459374e-07, + "loss": 0.0135, + "step": 1165 + }, + { + "epoch": 7.153374233128835, + "grad_norm": 2.6580259799957275, + "learning_rate": 9.385861362651322e-07, + "loss": 0.0138, + "step": 1166 + }, + { + "epoch": 7.159509202453988, + "grad_norm": 2.086371421813965, + "learning_rate": 9.348260062771713e-07, + "loss": 0.0093, + "step": 1167 + }, + { + "epoch": 7.1656441717791415, + "grad_norm": 1.0806611776351929, + "learning_rate": 9.310716904498321e-07, + "loss": 0.003, + "step": 1168 + }, + { + "epoch": 7.171779141104294, + "grad_norm": 1.2487165927886963, + "learning_rate": 9.273232027292933e-07, + "loss": 0.0033, + "step": 1169 + }, + { + "epoch": 7.177914110429448, + "grad_norm": 1.0647703409194946, + "learning_rate": 9.235805570400813e-07, + "loss": 0.0024, + "step": 1170 + }, + { + "epoch": 7.184049079754601, + "grad_norm": 1.6039917469024658, + "learning_rate": 9.198437672850249e-07, + "loss": 0.0118, + "step": 1171 + }, + { + "epoch": 7.190184049079755, + "grad_norm": 2.199977159500122, + "learning_rate": 9.161128473451967e-07, + "loss": 0.0173, + "step": 1172 + }, + { + "epoch": 7.196319018404908, + "grad_norm": 2.51725697517395, + "learning_rate": 9.123878110798662e-07, + "loss": 0.0142, + "step": 1173 + }, + { + "epoch": 7.2024539877300615, + "grad_norm": 1.841742753982544, + "learning_rate": 9.086686723264474e-07, + "loss": 0.012, + "step": 1174 + }, + { + "epoch": 7.208588957055214, + "grad_norm": 1.212876319885254, + "learning_rate": 9.049554449004447e-07, + "loss": 0.0055, + "step": 1175 + }, + { + "epoch": 7.214723926380368, + "grad_norm": 1.3728275299072266, + "learning_rate": 9.012481425954053e-07, + "loss": 0.0043, + "step": 1176 + }, + { + "epoch": 7.220858895705521, + "grad_norm": 2.3055357933044434, + "learning_rate": 8.97546779182866e-07, + "loss": 0.0443, + "step": 1177 + }, + { + "epoch": 7.226993865030675, + "grad_norm": 2.017620801925659, + "learning_rate": 8.938513684123024e-07, + "loss": 0.0082, + "step": 1178 + }, + { + "epoch": 7.233128834355828, + "grad_norm": 1.5641282796859741, + "learning_rate": 8.901619240110781e-07, + "loss": 0.0071, + "step": 1179 + }, + { + "epoch": 7.2392638036809815, + "grad_norm": 1.3781960010528564, + "learning_rate": 8.864784596843917e-07, + "loss": 0.0056, + "step": 1180 + }, + { + "epoch": 7.245398773006135, + "grad_norm": 1.23178231716156, + "learning_rate": 8.828009891152301e-07, + "loss": 0.0076, + "step": 1181 + }, + { + "epoch": 7.251533742331288, + "grad_norm": 2.809582233428955, + "learning_rate": 8.791295259643126e-07, + "loss": 0.0141, + "step": 1182 + }, + { + "epoch": 7.257668711656442, + "grad_norm": 1.6520317792892456, + "learning_rate": 8.754640838700443e-07, + "loss": 0.01, + "step": 1183 + }, + { + "epoch": 7.263803680981595, + "grad_norm": 1.411852478981018, + "learning_rate": 8.718046764484648e-07, + "loss": 0.009, + "step": 1184 + }, + { + "epoch": 7.269938650306749, + "grad_norm": 2.9334425926208496, + "learning_rate": 8.681513172931935e-07, + "loss": 0.0291, + "step": 1185 + }, + { + "epoch": 7.276073619631902, + "grad_norm": 1.4273028373718262, + "learning_rate": 8.64504019975386e-07, + "loss": 0.0064, + "step": 1186 + }, + { + "epoch": 7.282208588957055, + "grad_norm": 1.9486448764801025, + "learning_rate": 8.608627980436765e-07, + "loss": 0.0135, + "step": 1187 + }, + { + "epoch": 7.288343558282208, + "grad_norm": 1.3740493059158325, + "learning_rate": 8.572276650241329e-07, + "loss": 0.0061, + "step": 1188 + }, + { + "epoch": 7.294478527607362, + "grad_norm": 1.3352797031402588, + "learning_rate": 8.535986344202057e-07, + "loss": 0.0051, + "step": 1189 + }, + { + "epoch": 7.300613496932515, + "grad_norm": 1.0336774587631226, + "learning_rate": 8.499757197126732e-07, + "loss": 0.0052, + "step": 1190 + }, + { + "epoch": 7.306748466257669, + "grad_norm": 1.1450837850570679, + "learning_rate": 8.463589343595976e-07, + "loss": 0.0111, + "step": 1191 + }, + { + "epoch": 7.3128834355828225, + "grad_norm": 2.504876136779785, + "learning_rate": 8.427482917962734e-07, + "loss": 0.0279, + "step": 1192 + }, + { + "epoch": 7.319018404907975, + "grad_norm": 1.569841980934143, + "learning_rate": 8.391438054351725e-07, + "loss": 0.0105, + "step": 1193 + }, + { + "epoch": 7.325153374233129, + "grad_norm": 1.218538761138916, + "learning_rate": 8.355454886659026e-07, + "loss": 0.0028, + "step": 1194 + }, + { + "epoch": 7.331288343558282, + "grad_norm": 2.084049940109253, + "learning_rate": 8.319533548551492e-07, + "loss": 0.0102, + "step": 1195 + }, + { + "epoch": 7.337423312883436, + "grad_norm": 2.326167345046997, + "learning_rate": 8.28367417346633e-07, + "loss": 0.0396, + "step": 1196 + }, + { + "epoch": 7.343558282208589, + "grad_norm": 1.2704310417175293, + "learning_rate": 8.247876894610568e-07, + "loss": 0.006, + "step": 1197 + }, + { + "epoch": 7.3496932515337425, + "grad_norm": 1.358012318611145, + "learning_rate": 8.212141844960544e-07, + "loss": 0.0075, + "step": 1198 + }, + { + "epoch": 7.355828220858895, + "grad_norm": 1.5145729780197144, + "learning_rate": 8.17646915726146e-07, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 7.361963190184049, + "grad_norm": 1.203041911125183, + "learning_rate": 8.140858964026849e-07, + "loss": 0.0032, + "step": 1200 + }, + { + "epoch": 7.368098159509202, + "grad_norm": 3.031280279159546, + "learning_rate": 8.105311397538085e-07, + "loss": 0.032, + "step": 1201 + }, + { + "epoch": 7.374233128834356, + "grad_norm": 1.416698694229126, + "learning_rate": 8.069826589843929e-07, + "loss": 0.0185, + "step": 1202 + }, + { + "epoch": 7.38036809815951, + "grad_norm": 0.9656457901000977, + "learning_rate": 8.034404672759977e-07, + "loss": 0.0034, + "step": 1203 + }, + { + "epoch": 7.386503067484663, + "grad_norm": 1.7239291667938232, + "learning_rate": 7.99904577786823e-07, + "loss": 0.034, + "step": 1204 + }, + { + "epoch": 7.392638036809816, + "grad_norm": 1.1560636758804321, + "learning_rate": 7.963750036516585e-07, + "loss": 0.005, + "step": 1205 + }, + { + "epoch": 7.398773006134969, + "grad_norm": 1.057456374168396, + "learning_rate": 7.928517579818312e-07, + "loss": 0.0073, + "step": 1206 + }, + { + "epoch": 7.404907975460123, + "grad_norm": 1.4066674709320068, + "learning_rate": 7.893348538651635e-07, + "loss": 0.015, + "step": 1207 + }, + { + "epoch": 7.411042944785276, + "grad_norm": 1.1061445474624634, + "learning_rate": 7.858243043659161e-07, + "loss": 0.004, + "step": 1208 + }, + { + "epoch": 7.41717791411043, + "grad_norm": 0.9575282335281372, + "learning_rate": 7.823201225247496e-07, + "loss": 0.003, + "step": 1209 + }, + { + "epoch": 7.423312883435583, + "grad_norm": 1.3790507316589355, + "learning_rate": 7.788223213586677e-07, + "loss": 0.0096, + "step": 1210 + }, + { + "epoch": 7.429447852760736, + "grad_norm": 1.1366883516311646, + "learning_rate": 7.753309138609705e-07, + "loss": 0.006, + "step": 1211 + }, + { + "epoch": 7.435582822085889, + "grad_norm": 2.2659928798675537, + "learning_rate": 7.71845913001211e-07, + "loss": 0.0074, + "step": 1212 + }, + { + "epoch": 7.441717791411043, + "grad_norm": 1.2541831731796265, + "learning_rate": 7.683673317251392e-07, + "loss": 0.0051, + "step": 1213 + }, + { + "epoch": 7.447852760736196, + "grad_norm": 1.5959513187408447, + "learning_rate": 7.648951829546619e-07, + "loss": 0.0271, + "step": 1214 + }, + { + "epoch": 7.45398773006135, + "grad_norm": 1.368452548980713, + "learning_rate": 7.6142947958779e-07, + "loss": 0.0155, + "step": 1215 + }, + { + "epoch": 7.460122699386503, + "grad_norm": 1.1851825714111328, + "learning_rate": 7.579702344985899e-07, + "loss": 0.0032, + "step": 1216 + }, + { + "epoch": 7.466257668711656, + "grad_norm": 1.419812560081482, + "learning_rate": 7.545174605371403e-07, + "loss": 0.0037, + "step": 1217 + }, + { + "epoch": 7.47239263803681, + "grad_norm": 1.0817372798919678, + "learning_rate": 7.510711705294782e-07, + "loss": 0.0064, + "step": 1218 + }, + { + "epoch": 7.478527607361963, + "grad_norm": 1.0459797382354736, + "learning_rate": 7.476313772775578e-07, + "loss": 0.0055, + "step": 1219 + }, + { + "epoch": 7.484662576687117, + "grad_norm": 1.4481663703918457, + "learning_rate": 7.441980935591986e-07, + "loss": 0.0049, + "step": 1220 + }, + { + "epoch": 7.49079754601227, + "grad_norm": 1.7337101697921753, + "learning_rate": 7.407713321280377e-07, + "loss": 0.0123, + "step": 1221 + }, + { + "epoch": 7.4969325153374236, + "grad_norm": 1.3378303050994873, + "learning_rate": 7.373511057134855e-07, + "loss": 0.0056, + "step": 1222 + }, + { + "epoch": 7.5030674846625764, + "grad_norm": 2.4353835582733154, + "learning_rate": 7.339374270206772e-07, + "loss": 0.0155, + "step": 1223 + }, + { + "epoch": 7.50920245398773, + "grad_norm": 2.2856571674346924, + "learning_rate": 7.305303087304227e-07, + "loss": 0.0303, + "step": 1224 + }, + { + "epoch": 7.515337423312883, + "grad_norm": 1.0627055168151855, + "learning_rate": 7.271297634991651e-07, + "loss": 0.0018, + "step": 1225 + }, + { + "epoch": 7.521472392638037, + "grad_norm": 1.2120238542556763, + "learning_rate": 7.237358039589271e-07, + "loss": 0.0064, + "step": 1226 + }, + { + "epoch": 7.52760736196319, + "grad_norm": 1.1861765384674072, + "learning_rate": 7.203484427172702e-07, + "loss": 0.0025, + "step": 1227 + }, + { + "epoch": 7.533742331288344, + "grad_norm": 1.6700332164764404, + "learning_rate": 7.169676923572447e-07, + "loss": 0.0067, + "step": 1228 + }, + { + "epoch": 7.539877300613497, + "grad_norm": 1.4527982473373413, + "learning_rate": 7.135935654373416e-07, + "loss": 0.0082, + "step": 1229 + }, + { + "epoch": 7.54601226993865, + "grad_norm": 1.1425046920776367, + "learning_rate": 7.102260744914499e-07, + "loss": 0.0042, + "step": 1230 + }, + { + "epoch": 7.552147239263804, + "grad_norm": 2.0762295722961426, + "learning_rate": 7.068652320288081e-07, + "loss": 0.0374, + "step": 1231 + }, + { + "epoch": 7.558282208588957, + "grad_norm": 1.2008321285247803, + "learning_rate": 7.035110505339546e-07, + "loss": 0.0022, + "step": 1232 + }, + { + "epoch": 7.564417177914111, + "grad_norm": 1.262100338935852, + "learning_rate": 7.001635424666878e-07, + "loss": 0.006, + "step": 1233 + }, + { + "epoch": 7.570552147239264, + "grad_norm": 1.8173811435699463, + "learning_rate": 6.968227202620137e-07, + "loss": 0.0137, + "step": 1234 + }, + { + "epoch": 7.576687116564417, + "grad_norm": 1.6977999210357666, + "learning_rate": 6.934885963301033e-07, + "loss": 0.0216, + "step": 1235 + }, + { + "epoch": 7.58282208588957, + "grad_norm": 0.7084318399429321, + "learning_rate": 6.901611830562469e-07, + "loss": 0.0027, + "step": 1236 + }, + { + "epoch": 7.588957055214724, + "grad_norm": 2.0332374572753906, + "learning_rate": 6.868404928008035e-07, + "loss": 0.0391, + "step": 1237 + }, + { + "epoch": 7.595092024539877, + "grad_norm": 1.235734224319458, + "learning_rate": 6.835265378991613e-07, + "loss": 0.0053, + "step": 1238 + }, + { + "epoch": 7.601226993865031, + "grad_norm": 2.687920331954956, + "learning_rate": 6.802193306616858e-07, + "loss": 0.0395, + "step": 1239 + }, + { + "epoch": 7.6073619631901845, + "grad_norm": 1.4211101531982422, + "learning_rate": 6.769188833736781e-07, + "loss": 0.0055, + "step": 1240 + }, + { + "epoch": 7.613496932515337, + "grad_norm": 2.4542644023895264, + "learning_rate": 6.736252082953307e-07, + "loss": 0.0072, + "step": 1241 + }, + { + "epoch": 7.61963190184049, + "grad_norm": 1.2946943044662476, + "learning_rate": 6.703383176616743e-07, + "loss": 0.0046, + "step": 1242 + }, + { + "epoch": 7.625766871165644, + "grad_norm": 3.8073277473449707, + "learning_rate": 6.670582236825421e-07, + "loss": 0.0742, + "step": 1243 + }, + { + "epoch": 7.631901840490798, + "grad_norm": 1.4291348457336426, + "learning_rate": 6.637849385425157e-07, + "loss": 0.0069, + "step": 1244 + }, + { + "epoch": 7.638036809815951, + "grad_norm": 1.1767655611038208, + "learning_rate": 6.605184744008866e-07, + "loss": 0.0031, + "step": 1245 + }, + { + "epoch": 7.644171779141105, + "grad_norm": 1.837077260017395, + "learning_rate": 6.572588433916082e-07, + "loss": 0.0316, + "step": 1246 + }, + { + "epoch": 7.6503067484662575, + "grad_norm": 1.9157041311264038, + "learning_rate": 6.540060576232488e-07, + "loss": 0.0472, + "step": 1247 + }, + { + "epoch": 7.656441717791411, + "grad_norm": 1.7347630262374878, + "learning_rate": 6.507601291789515e-07, + "loss": 0.0059, + "step": 1248 + }, + { + "epoch": 7.662576687116564, + "grad_norm": 0.9757588505744934, + "learning_rate": 6.475210701163828e-07, + "loss": 0.0023, + "step": 1249 + }, + { + "epoch": 7.668711656441718, + "grad_norm": 1.9460281133651733, + "learning_rate": 6.442888924676951e-07, + "loss": 0.0207, + "step": 1250 + }, + { + "epoch": 7.674846625766871, + "grad_norm": 0.7517938613891602, + "learning_rate": 6.410636082394772e-07, + "loss": 0.002, + "step": 1251 + }, + { + "epoch": 7.680981595092025, + "grad_norm": 1.0631566047668457, + "learning_rate": 6.378452294127091e-07, + "loss": 0.0038, + "step": 1252 + }, + { + "epoch": 7.6871165644171775, + "grad_norm": 0.9524463415145874, + "learning_rate": 6.346337679427214e-07, + "loss": 0.0024, + "step": 1253 + }, + { + "epoch": 7.693251533742331, + "grad_norm": 1.3653123378753662, + "learning_rate": 6.314292357591489e-07, + "loss": 0.0027, + "step": 1254 + }, + { + "epoch": 7.699386503067485, + "grad_norm": 1.2446377277374268, + "learning_rate": 6.282316447658837e-07, + "loss": 0.0048, + "step": 1255 + }, + { + "epoch": 7.705521472392638, + "grad_norm": 1.716244101524353, + "learning_rate": 6.250410068410367e-07, + "loss": 0.0064, + "step": 1256 + }, + { + "epoch": 7.711656441717792, + "grad_norm": 1.7151219844818115, + "learning_rate": 6.218573338368869e-07, + "loss": 0.0056, + "step": 1257 + }, + { + "epoch": 7.717791411042945, + "grad_norm": 1.8013248443603516, + "learning_rate": 6.186806375798429e-07, + "loss": 0.0073, + "step": 1258 + }, + { + "epoch": 7.723926380368098, + "grad_norm": 1.051620602607727, + "learning_rate": 6.155109298703968e-07, + "loss": 0.0043, + "step": 1259 + }, + { + "epoch": 7.730061349693251, + "grad_norm": 1.5731337070465088, + "learning_rate": 6.123482224830787e-07, + "loss": 0.0108, + "step": 1260 + }, + { + "epoch": 7.736196319018405, + "grad_norm": 2.232144832611084, + "learning_rate": 6.091925271664156e-07, + "loss": 0.0337, + "step": 1261 + }, + { + "epoch": 7.742331288343558, + "grad_norm": 1.072678565979004, + "learning_rate": 6.060438556428877e-07, + "loss": 0.0019, + "step": 1262 + }, + { + "epoch": 7.748466257668712, + "grad_norm": 2.3631110191345215, + "learning_rate": 6.02902219608881e-07, + "loss": 0.0089, + "step": 1263 + }, + { + "epoch": 7.754601226993865, + "grad_norm": 1.1171438694000244, + "learning_rate": 5.997676307346504e-07, + "loss": 0.0045, + "step": 1264 + }, + { + "epoch": 7.7607361963190185, + "grad_norm": 0.7839979529380798, + "learning_rate": 5.966401006642689e-07, + "loss": 0.0028, + "step": 1265 + }, + { + "epoch": 7.766871165644172, + "grad_norm": 1.5938968658447266, + "learning_rate": 5.93519641015591e-07, + "loss": 0.009, + "step": 1266 + }, + { + "epoch": 7.773006134969325, + "grad_norm": 1.2980104684829712, + "learning_rate": 5.904062633802066e-07, + "loss": 0.0168, + "step": 1267 + }, + { + "epoch": 7.779141104294479, + "grad_norm": 1.177626371383667, + "learning_rate": 5.872999793233952e-07, + "loss": 0.0029, + "step": 1268 + }, + { + "epoch": 7.785276073619632, + "grad_norm": 2.0138931274414062, + "learning_rate": 5.842008003840891e-07, + "loss": 0.015, + "step": 1269 + }, + { + "epoch": 7.791411042944786, + "grad_norm": 1.7204387187957764, + "learning_rate": 5.811087380748245e-07, + "loss": 0.011, + "step": 1270 + }, + { + "epoch": 7.7975460122699385, + "grad_norm": 1.506241798400879, + "learning_rate": 5.780238038817035e-07, + "loss": 0.0057, + "step": 1271 + }, + { + "epoch": 7.803680981595092, + "grad_norm": 2.0950393676757812, + "learning_rate": 5.74946009264348e-07, + "loss": 0.0131, + "step": 1272 + }, + { + "epoch": 7.809815950920245, + "grad_norm": 2.1451432704925537, + "learning_rate": 5.71875365655859e-07, + "loss": 0.0088, + "step": 1273 + }, + { + "epoch": 7.815950920245399, + "grad_norm": 0.9690236449241638, + "learning_rate": 5.688118844627746e-07, + "loss": 0.0033, + "step": 1274 + }, + { + "epoch": 7.822085889570552, + "grad_norm": 1.5690608024597168, + "learning_rate": 5.657555770650241e-07, + "loss": 0.0206, + "step": 1275 + }, + { + "epoch": 7.828220858895706, + "grad_norm": 1.8220988512039185, + "learning_rate": 5.627064548158903e-07, + "loss": 0.0096, + "step": 1276 + }, + { + "epoch": 7.8343558282208585, + "grad_norm": 2.3800559043884277, + "learning_rate": 5.596645290419653e-07, + "loss": 0.008, + "step": 1277 + }, + { + "epoch": 7.840490797546012, + "grad_norm": 0.7775714993476868, + "learning_rate": 5.566298110431068e-07, + "loss": 0.0016, + "step": 1278 + }, + { + "epoch": 7.846625766871165, + "grad_norm": 1.1196876764297485, + "learning_rate": 5.536023120924e-07, + "loss": 0.0033, + "step": 1279 + }, + { + "epoch": 7.852760736196319, + "grad_norm": 1.3722344636917114, + "learning_rate": 5.505820434361108e-07, + "loss": 0.0084, + "step": 1280 + }, + { + "epoch": 7.858895705521473, + "grad_norm": 1.2068676948547363, + "learning_rate": 5.47569016293649e-07, + "loss": 0.0049, + "step": 1281 + }, + { + "epoch": 7.865030674846626, + "grad_norm": 1.096085548400879, + "learning_rate": 5.445632418575239e-07, + "loss": 0.0019, + "step": 1282 + }, + { + "epoch": 7.871165644171779, + "grad_norm": 1.3178106546401978, + "learning_rate": 5.415647312933015e-07, + "loss": 0.0062, + "step": 1283 + }, + { + "epoch": 7.877300613496932, + "grad_norm": 1.2884724140167236, + "learning_rate": 5.385734957395664e-07, + "loss": 0.0081, + "step": 1284 + }, + { + "epoch": 7.883435582822086, + "grad_norm": 0.9866589307785034, + "learning_rate": 5.355895463078789e-07, + "loss": 0.0048, + "step": 1285 + }, + { + "epoch": 7.889570552147239, + "grad_norm": 1.5396437644958496, + "learning_rate": 5.326128940827313e-07, + "loss": 0.0088, + "step": 1286 + }, + { + "epoch": 7.895705521472393, + "grad_norm": 1.1183607578277588, + "learning_rate": 5.296435501215116e-07, + "loss": 0.0043, + "step": 1287 + }, + { + "epoch": 7.901840490797546, + "grad_norm": 1.5337073802947998, + "learning_rate": 5.266815254544572e-07, + "loss": 0.0099, + "step": 1288 + }, + { + "epoch": 7.9079754601226995, + "grad_norm": 1.8188867568969727, + "learning_rate": 5.237268310846183e-07, + "loss": 0.0086, + "step": 1289 + }, + { + "epoch": 7.914110429447852, + "grad_norm": 1.972072720527649, + "learning_rate": 5.207794779878156e-07, + "loss": 0.0442, + "step": 1290 + }, + { + "epoch": 7.920245398773006, + "grad_norm": 1.1226261854171753, + "learning_rate": 5.178394771125969e-07, + "loss": 0.0071, + "step": 1291 + }, + { + "epoch": 7.92638036809816, + "grad_norm": 1.5612869262695312, + "learning_rate": 5.149068393802009e-07, + "loss": 0.0192, + "step": 1292 + }, + { + "epoch": 7.932515337423313, + "grad_norm": 1.1532280445098877, + "learning_rate": 5.119815756845123e-07, + "loss": 0.0032, + "step": 1293 + }, + { + "epoch": 7.938650306748467, + "grad_norm": 1.8807255029678345, + "learning_rate": 5.090636968920252e-07, + "loss": 0.0139, + "step": 1294 + }, + { + "epoch": 7.9447852760736195, + "grad_norm": 1.3027002811431885, + "learning_rate": 5.061532138418013e-07, + "loss": 0.0071, + "step": 1295 + }, + { + "epoch": 7.950920245398773, + "grad_norm": 1.584154486656189, + "learning_rate": 5.032501373454266e-07, + "loss": 0.0056, + "step": 1296 + }, + { + "epoch": 7.957055214723926, + "grad_norm": 1.7631733417510986, + "learning_rate": 5.003544781869762e-07, + "loss": 0.0239, + "step": 1297 + }, + { + "epoch": 7.96319018404908, + "grad_norm": 1.9462637901306152, + "learning_rate": 4.974662471229727e-07, + "loss": 0.0336, + "step": 1298 + }, + { + "epoch": 7.969325153374233, + "grad_norm": 1.9697695970535278, + "learning_rate": 4.945854548823425e-07, + "loss": 0.0049, + "step": 1299 + }, + { + "epoch": 7.975460122699387, + "grad_norm": 1.066036581993103, + "learning_rate": 4.917121121663823e-07, + "loss": 0.0103, + "step": 1300 + }, + { + "epoch": 7.9815950920245395, + "grad_norm": 1.0865890979766846, + "learning_rate": 4.888462296487129e-07, + "loss": 0.0036, + "step": 1301 + }, + { + "epoch": 7.987730061349693, + "grad_norm": 1.7804820537567139, + "learning_rate": 4.859878179752448e-07, + "loss": 0.0119, + "step": 1302 + }, + { + "epoch": 7.993865030674847, + "grad_norm": 2.735875129699707, + "learning_rate": 4.83136887764136e-07, + "loss": 0.0365, + "step": 1303 + }, + { + "epoch": 8.0, + "grad_norm": 1.316243290901184, + "learning_rate": 4.802934496057527e-07, + "loss": 0.0046, + "step": 1304 + }, + { + "epoch": 8.006134969325153, + "grad_norm": 2.192969560623169, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0235, + "step": 1305 + }, + { + "epoch": 8.012269938650308, + "grad_norm": 0.9257994890213013, + "learning_rate": 4.746290916694368e-07, + "loss": 0.0029, + "step": 1306 + }, + { + "epoch": 8.01840490797546, + "grad_norm": 0.6933830380439758, + "learning_rate": 4.71808192932926e-07, + "loss": 0.0019, + "step": 1307 + }, + { + "epoch": 8.024539877300613, + "grad_norm": 0.4838462173938751, + "learning_rate": 4.6899482833190765e-07, + "loss": 0.0024, + "step": 1308 + }, + { + "epoch": 8.030674846625766, + "grad_norm": 1.1725589036941528, + "learning_rate": 4.661890083172019e-07, + "loss": 0.0166, + "step": 1309 + }, + { + "epoch": 8.036809815950921, + "grad_norm": 0.7732264399528503, + "learning_rate": 4.633907433116053e-07, + "loss": 0.0047, + "step": 1310 + }, + { + "epoch": 8.042944785276074, + "grad_norm": 0.6369810700416565, + "learning_rate": 4.6060004370984763e-07, + "loss": 0.0013, + "step": 1311 + }, + { + "epoch": 8.049079754601227, + "grad_norm": 0.6437183618545532, + "learning_rate": 4.5781691987855676e-07, + "loss": 0.0016, + "step": 1312 + }, + { + "epoch": 8.05521472392638, + "grad_norm": 0.40145647525787354, + "learning_rate": 4.5504138215621915e-07, + "loss": 0.0026, + "step": 1313 + }, + { + "epoch": 8.061349693251534, + "grad_norm": 1.1000946760177612, + "learning_rate": 4.5227344085313873e-07, + "loss": 0.002, + "step": 1314 + }, + { + "epoch": 8.067484662576687, + "grad_norm": 1.4580782651901245, + "learning_rate": 4.495131062514038e-07, + "loss": 0.0299, + "step": 1315 + }, + { + "epoch": 8.07361963190184, + "grad_norm": 0.9026187062263489, + "learning_rate": 4.467603886048452e-07, + "loss": 0.003, + "step": 1316 + }, + { + "epoch": 8.079754601226995, + "grad_norm": 1.2969629764556885, + "learning_rate": 4.440152981389972e-07, + "loss": 0.0129, + "step": 1317 + }, + { + "epoch": 8.085889570552148, + "grad_norm": 0.837665319442749, + "learning_rate": 4.412778450510641e-07, + "loss": 0.0086, + "step": 1318 + }, + { + "epoch": 8.0920245398773, + "grad_norm": 0.3426748216152191, + "learning_rate": 4.3854803950987736e-07, + "loss": 0.002, + "step": 1319 + }, + { + "epoch": 8.098159509202453, + "grad_norm": 0.8508721590042114, + "learning_rate": 4.358258916558611e-07, + "loss": 0.0016, + "step": 1320 + }, + { + "epoch": 8.104294478527608, + "grad_norm": 1.2476134300231934, + "learning_rate": 4.331114116009938e-07, + "loss": 0.0156, + "step": 1321 + }, + { + "epoch": 8.110429447852761, + "grad_norm": 1.036689281463623, + "learning_rate": 4.3040460942876896e-07, + "loss": 0.0021, + "step": 1322 + }, + { + "epoch": 8.116564417177914, + "grad_norm": 0.7747099995613098, + "learning_rate": 4.277054951941609e-07, + "loss": 0.0021, + "step": 1323 + }, + { + "epoch": 8.122699386503067, + "grad_norm": 1.2793506383895874, + "learning_rate": 4.250140789235829e-07, + "loss": 0.0036, + "step": 1324 + }, + { + "epoch": 8.128834355828221, + "grad_norm": 1.5389785766601562, + "learning_rate": 4.223303706148549e-07, + "loss": 0.0031, + "step": 1325 + }, + { + "epoch": 8.134969325153374, + "grad_norm": 1.549869179725647, + "learning_rate": 4.196543802371641e-07, + "loss": 0.0102, + "step": 1326 + }, + { + "epoch": 8.141104294478527, + "grad_norm": 0.862311065196991, + "learning_rate": 4.1698611773102525e-07, + "loss": 0.0023, + "step": 1327 + }, + { + "epoch": 8.14723926380368, + "grad_norm": 1.0216046571731567, + "learning_rate": 4.14325593008249e-07, + "loss": 0.0074, + "step": 1328 + }, + { + "epoch": 8.153374233128835, + "grad_norm": 0.8307499289512634, + "learning_rate": 4.1167281595190206e-07, + "loss": 0.0017, + "step": 1329 + }, + { + "epoch": 8.159509202453988, + "grad_norm": 0.5344944596290588, + "learning_rate": 4.090277964162692e-07, + "loss": 0.0013, + "step": 1330 + }, + { + "epoch": 8.16564417177914, + "grad_norm": 0.8608856201171875, + "learning_rate": 4.063905442268201e-07, + "loss": 0.0014, + "step": 1331 + }, + { + "epoch": 8.171779141104295, + "grad_norm": 0.33019620180130005, + "learning_rate": 4.037610691801694e-07, + "loss": 0.0009, + "step": 1332 + }, + { + "epoch": 8.177914110429448, + "grad_norm": 0.6515982747077942, + "learning_rate": 4.011393810440431e-07, + "loss": 0.0022, + "step": 1333 + }, + { + "epoch": 8.184049079754601, + "grad_norm": 0.9144461750984192, + "learning_rate": 3.985254895572413e-07, + "loss": 0.0024, + "step": 1334 + }, + { + "epoch": 8.190184049079754, + "grad_norm": 0.4078105390071869, + "learning_rate": 3.959194044296011e-07, + "loss": 0.0011, + "step": 1335 + }, + { + "epoch": 8.196319018404909, + "grad_norm": 0.7559608817100525, + "learning_rate": 3.9332113534196194e-07, + "loss": 0.0028, + "step": 1336 + }, + { + "epoch": 8.202453987730062, + "grad_norm": 1.3025604486465454, + "learning_rate": 3.907306919461279e-07, + "loss": 0.0228, + "step": 1337 + }, + { + "epoch": 8.208588957055214, + "grad_norm": 0.6984004974365234, + "learning_rate": 3.8814808386483385e-07, + "loss": 0.0027, + "step": 1338 + }, + { + "epoch": 8.214723926380367, + "grad_norm": 1.161498785018921, + "learning_rate": 3.855733206917095e-07, + "loss": 0.0037, + "step": 1339 + }, + { + "epoch": 8.220858895705522, + "grad_norm": 0.5357164740562439, + "learning_rate": 3.8300641199124024e-07, + "loss": 0.0011, + "step": 1340 + }, + { + "epoch": 8.226993865030675, + "grad_norm": 0.8089649677276611, + "learning_rate": 3.80447367298738e-07, + "loss": 0.0008, + "step": 1341 + }, + { + "epoch": 8.233128834355828, + "grad_norm": 0.4289240539073944, + "learning_rate": 3.77896196120299e-07, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 8.239263803680982, + "grad_norm": 0.8666973114013672, + "learning_rate": 3.7535290793277364e-07, + "loss": 0.0047, + "step": 1343 + }, + { + "epoch": 8.245398773006135, + "grad_norm": 0.6841573715209961, + "learning_rate": 3.7281751218372965e-07, + "loss": 0.0007, + "step": 1344 + }, + { + "epoch": 8.251533742331288, + "grad_norm": 0.5588045716285706, + "learning_rate": 3.7029001829141457e-07, + "loss": 0.0018, + "step": 1345 + }, + { + "epoch": 8.257668711656441, + "grad_norm": 1.7257133722305298, + "learning_rate": 3.677704356447254e-07, + "loss": 0.0213, + "step": 1346 + }, + { + "epoch": 8.263803680981596, + "grad_norm": 0.2352600246667862, + "learning_rate": 3.6525877360316875e-07, + "loss": 0.0009, + "step": 1347 + }, + { + "epoch": 8.269938650306749, + "grad_norm": 0.9622183442115784, + "learning_rate": 3.627550414968303e-07, + "loss": 0.0132, + "step": 1348 + }, + { + "epoch": 8.276073619631902, + "grad_norm": 0.5367354154586792, + "learning_rate": 3.6025924862633814e-07, + "loss": 0.0006, + "step": 1349 + }, + { + "epoch": 8.282208588957054, + "grad_norm": 1.5134315490722656, + "learning_rate": 3.577714042628272e-07, + "loss": 0.01, + "step": 1350 + }, + { + "epoch": 8.28834355828221, + "grad_norm": 1.5052622556686401, + "learning_rate": 3.5529151764790715e-07, + "loss": 0.0031, + "step": 1351 + }, + { + "epoch": 8.294478527607362, + "grad_norm": 0.8776562809944153, + "learning_rate": 3.5281959799362775e-07, + "loss": 0.0053, + "step": 1352 + }, + { + "epoch": 8.300613496932515, + "grad_norm": 0.7919799089431763, + "learning_rate": 3.503556544824413e-07, + "loss": 0.0021, + "step": 1353 + }, + { + "epoch": 8.30674846625767, + "grad_norm": 0.7141364216804504, + "learning_rate": 3.4789969626717377e-07, + "loss": 0.0019, + "step": 1354 + }, + { + "epoch": 8.312883435582823, + "grad_norm": 1.7783756256103516, + "learning_rate": 3.454517324709858e-07, + "loss": 0.0019, + "step": 1355 + }, + { + "epoch": 8.319018404907975, + "grad_norm": 0.9534929394721985, + "learning_rate": 3.43011772187343e-07, + "loss": 0.0011, + "step": 1356 + }, + { + "epoch": 8.325153374233128, + "grad_norm": 0.4383384585380554, + "learning_rate": 3.405798244799799e-07, + "loss": 0.0006, + "step": 1357 + }, + { + "epoch": 8.331288343558283, + "grad_norm": 0.8582566976547241, + "learning_rate": 3.3815589838286535e-07, + "loss": 0.002, + "step": 1358 + }, + { + "epoch": 8.337423312883436, + "grad_norm": 0.8288223743438721, + "learning_rate": 3.3574000290017174e-07, + "loss": 0.002, + "step": 1359 + }, + { + "epoch": 8.343558282208589, + "grad_norm": 1.2074549198150635, + "learning_rate": 3.3333214700623976e-07, + "loss": 0.0153, + "step": 1360 + }, + { + "epoch": 8.349693251533742, + "grad_norm": 0.5359098315238953, + "learning_rate": 3.3093233964554464e-07, + "loss": 0.0014, + "step": 1361 + }, + { + "epoch": 8.355828220858896, + "grad_norm": 1.6650397777557373, + "learning_rate": 3.2854058973266547e-07, + "loss": 0.0107, + "step": 1362 + }, + { + "epoch": 8.36196319018405, + "grad_norm": 1.1784273386001587, + "learning_rate": 3.261569061522474e-07, + "loss": 0.0197, + "step": 1363 + }, + { + "epoch": 8.368098159509202, + "grad_norm": 0.6566861271858215, + "learning_rate": 3.237812977589738e-07, + "loss": 0.0009, + "step": 1364 + }, + { + "epoch": 8.374233128834355, + "grad_norm": 0.9043551683425903, + "learning_rate": 3.2141377337753105e-07, + "loss": 0.0026, + "step": 1365 + }, + { + "epoch": 8.38036809815951, + "grad_norm": 2.205872058868408, + "learning_rate": 3.190543418025749e-07, + "loss": 0.0533, + "step": 1366 + }, + { + "epoch": 8.386503067484663, + "grad_norm": 0.2918683886528015, + "learning_rate": 3.167030117986994e-07, + "loss": 0.0007, + "step": 1367 + }, + { + "epoch": 8.392638036809815, + "grad_norm": 0.5370535850524902, + "learning_rate": 3.143597921004027e-07, + "loss": 0.001, + "step": 1368 + }, + { + "epoch": 8.39877300613497, + "grad_norm": 1.353083610534668, + "learning_rate": 3.120246914120564e-07, + "loss": 0.002, + "step": 1369 + }, + { + "epoch": 8.404907975460123, + "grad_norm": 0.644607424736023, + "learning_rate": 3.096977184078731e-07, + "loss": 0.0025, + "step": 1370 + }, + { + "epoch": 8.411042944785276, + "grad_norm": 0.7351365089416504, + "learning_rate": 3.0737888173187067e-07, + "loss": 0.0014, + "step": 1371 + }, + { + "epoch": 8.417177914110429, + "grad_norm": 1.161787748336792, + "learning_rate": 3.050681899978464e-07, + "loss": 0.0149, + "step": 1372 + }, + { + "epoch": 8.423312883435583, + "grad_norm": 1.7568200826644897, + "learning_rate": 3.0276565178933847e-07, + "loss": 0.0178, + "step": 1373 + }, + { + "epoch": 8.429447852760736, + "grad_norm": 0.73989337682724, + "learning_rate": 3.004712756595993e-07, + "loss": 0.0053, + "step": 1374 + }, + { + "epoch": 8.43558282208589, + "grad_norm": 1.8425425291061401, + "learning_rate": 2.9818507013156085e-07, + "loss": 0.0013, + "step": 1375 + }, + { + "epoch": 8.441717791411042, + "grad_norm": 0.6374561786651611, + "learning_rate": 2.9590704369780313e-07, + "loss": 0.0039, + "step": 1376 + }, + { + "epoch": 8.447852760736197, + "grad_norm": 0.708151638507843, + "learning_rate": 2.9363720482052436e-07, + "loss": 0.0025, + "step": 1377 + }, + { + "epoch": 8.45398773006135, + "grad_norm": 1.2846306562423706, + "learning_rate": 2.91375561931507e-07, + "loss": 0.0033, + "step": 1378 + }, + { + "epoch": 8.460122699386503, + "grad_norm": 0.347720742225647, + "learning_rate": 2.89122123432089e-07, + "loss": 0.0006, + "step": 1379 + }, + { + "epoch": 8.466257668711656, + "grad_norm": 0.9626922607421875, + "learning_rate": 2.868768976931313e-07, + "loss": 0.001, + "step": 1380 + }, + { + "epoch": 8.47239263803681, + "grad_norm": 0.26909729838371277, + "learning_rate": 2.8463989305498596e-07, + "loss": 0.0008, + "step": 1381 + }, + { + "epoch": 8.478527607361963, + "grad_norm": 0.8750791549682617, + "learning_rate": 2.824111178274669e-07, + "loss": 0.0025, + "step": 1382 + }, + { + "epoch": 8.484662576687116, + "grad_norm": 1.1124992370605469, + "learning_rate": 2.801905802898183e-07, + "loss": 0.0031, + "step": 1383 + }, + { + "epoch": 8.49079754601227, + "grad_norm": 0.4871549904346466, + "learning_rate": 2.779782886906829e-07, + "loss": 0.0013, + "step": 1384 + }, + { + "epoch": 8.496932515337424, + "grad_norm": 0.5207282900810242, + "learning_rate": 2.7577425124807324e-07, + "loss": 0.0013, + "step": 1385 + }, + { + "epoch": 8.503067484662576, + "grad_norm": 1.8369935750961304, + "learning_rate": 2.7357847614933876e-07, + "loss": 0.0031, + "step": 1386 + }, + { + "epoch": 8.50920245398773, + "grad_norm": 0.6390517354011536, + "learning_rate": 2.713909715511384e-07, + "loss": 0.0045, + "step": 1387 + }, + { + "epoch": 8.515337423312884, + "grad_norm": 0.8618245124816895, + "learning_rate": 2.692117455794077e-07, + "loss": 0.0017, + "step": 1388 + }, + { + "epoch": 8.521472392638037, + "grad_norm": 0.8506134152412415, + "learning_rate": 2.6704080632932895e-07, + "loss": 0.0014, + "step": 1389 + }, + { + "epoch": 8.52760736196319, + "grad_norm": 0.42547252774238586, + "learning_rate": 2.6487816186530263e-07, + "loss": 0.002, + "step": 1390 + }, + { + "epoch": 8.533742331288344, + "grad_norm": 0.6425843834877014, + "learning_rate": 2.6272382022091704e-07, + "loss": 0.0028, + "step": 1391 + }, + { + "epoch": 8.539877300613497, + "grad_norm": 0.8287162780761719, + "learning_rate": 2.6057778939891614e-07, + "loss": 0.011, + "step": 1392 + }, + { + "epoch": 8.54601226993865, + "grad_norm": 1.0402963161468506, + "learning_rate": 2.584400773711737e-07, + "loss": 0.0037, + "step": 1393 + }, + { + "epoch": 8.552147239263803, + "grad_norm": 0.9785431623458862, + "learning_rate": 2.5631069207865926e-07, + "loss": 0.0023, + "step": 1394 + }, + { + "epoch": 8.558282208588958, + "grad_norm": 1.2661131620407104, + "learning_rate": 2.541896414314132e-07, + "loss": 0.0053, + "step": 1395 + }, + { + "epoch": 8.56441717791411, + "grad_norm": 0.2662440240383148, + "learning_rate": 2.520769333085141e-07, + "loss": 0.0008, + "step": 1396 + }, + { + "epoch": 8.570552147239264, + "grad_norm": 0.628510594367981, + "learning_rate": 2.4997257555805064e-07, + "loss": 0.001, + "step": 1397 + }, + { + "epoch": 8.576687116564417, + "grad_norm": 1.08578622341156, + "learning_rate": 2.4787657599709276e-07, + "loss": 0.0041, + "step": 1398 + }, + { + "epoch": 8.582822085889571, + "grad_norm": 0.8213603496551514, + "learning_rate": 2.4578894241166135e-07, + "loss": 0.0029, + "step": 1399 + }, + { + "epoch": 8.588957055214724, + "grad_norm": 0.5261257886886597, + "learning_rate": 2.4370968255670093e-07, + "loss": 0.001, + "step": 1400 + }, + { + "epoch": 8.595092024539877, + "grad_norm": 0.18139345943927765, + "learning_rate": 2.4163880415604913e-07, + "loss": 0.0005, + "step": 1401 + }, + { + "epoch": 8.60122699386503, + "grad_norm": 0.8317165970802307, + "learning_rate": 2.395763149024102e-07, + "loss": 0.0034, + "step": 1402 + }, + { + "epoch": 8.607361963190185, + "grad_norm": 1.272074580192566, + "learning_rate": 2.3752222245732454e-07, + "loss": 0.0036, + "step": 1403 + }, + { + "epoch": 8.613496932515337, + "grad_norm": 0.5556488633155823, + "learning_rate": 2.3547653445114032e-07, + "loss": 0.0013, + "step": 1404 + }, + { + "epoch": 8.61963190184049, + "grad_norm": 0.6546408534049988, + "learning_rate": 2.334392584829867e-07, + "loss": 0.0008, + "step": 1405 + }, + { + "epoch": 8.625766871165645, + "grad_norm": 2.021836996078491, + "learning_rate": 2.3141040212074445e-07, + "loss": 0.0198, + "step": 1406 + }, + { + "epoch": 8.631901840490798, + "grad_norm": 0.6017210483551025, + "learning_rate": 2.293899729010171e-07, + "loss": 0.0033, + "step": 1407 + }, + { + "epoch": 8.63803680981595, + "grad_norm": 0.315134733915329, + "learning_rate": 2.2737797832910498e-07, + "loss": 0.0007, + "step": 1408 + }, + { + "epoch": 8.644171779141104, + "grad_norm": 0.7090817093849182, + "learning_rate": 2.2537442587897474e-07, + "loss": 0.0045, + "step": 1409 + }, + { + "epoch": 8.650306748466258, + "grad_norm": 0.26951614022254944, + "learning_rate": 2.2337932299323434e-07, + "loss": 0.001, + "step": 1410 + }, + { + "epoch": 8.656441717791411, + "grad_norm": 0.21670447289943695, + "learning_rate": 2.2139267708310457e-07, + "loss": 0.0005, + "step": 1411 + }, + { + "epoch": 8.662576687116564, + "grad_norm": 1.070379376411438, + "learning_rate": 2.194144955283886e-07, + "loss": 0.0022, + "step": 1412 + }, + { + "epoch": 8.668711656441717, + "grad_norm": 0.7644438147544861, + "learning_rate": 2.1744478567744947e-07, + "loss": 0.0023, + "step": 1413 + }, + { + "epoch": 8.674846625766872, + "grad_norm": 1.053305983543396, + "learning_rate": 2.154835548471798e-07, + "loss": 0.0027, + "step": 1414 + }, + { + "epoch": 8.680981595092025, + "grad_norm": 0.5719135403633118, + "learning_rate": 2.1353081032297356e-07, + "loss": 0.0015, + "step": 1415 + }, + { + "epoch": 8.687116564417177, + "grad_norm": 0.3360785245895386, + "learning_rate": 2.1158655935870325e-07, + "loss": 0.0025, + "step": 1416 + }, + { + "epoch": 8.69325153374233, + "grad_norm": 0.867242693901062, + "learning_rate": 2.0965080917668744e-07, + "loss": 0.002, + "step": 1417 + }, + { + "epoch": 8.699386503067485, + "grad_norm": 1.1389360427856445, + "learning_rate": 2.077235669676689e-07, + "loss": 0.0023, + "step": 1418 + }, + { + "epoch": 8.705521472392638, + "grad_norm": 0.31157732009887695, + "learning_rate": 2.0580483989078525e-07, + "loss": 0.0005, + "step": 1419 + }, + { + "epoch": 8.71165644171779, + "grad_norm": 1.328353762626648, + "learning_rate": 2.0389463507354211e-07, + "loss": 0.0122, + "step": 1420 + }, + { + "epoch": 8.717791411042946, + "grad_norm": 0.13456307351589203, + "learning_rate": 2.0199295961178893e-07, + "loss": 0.0005, + "step": 1421 + }, + { + "epoch": 8.723926380368098, + "grad_norm": 0.7963683605194092, + "learning_rate": 2.000998205696894e-07, + "loss": 0.004, + "step": 1422 + }, + { + "epoch": 8.730061349693251, + "grad_norm": 0.1814875602722168, + "learning_rate": 1.9821522497969813e-07, + "loss": 0.0004, + "step": 1423 + }, + { + "epoch": 8.736196319018404, + "grad_norm": 0.4806751012802124, + "learning_rate": 1.9633917984253294e-07, + "loss": 0.001, + "step": 1424 + }, + { + "epoch": 8.742331288343559, + "grad_norm": 0.6554126143455505, + "learning_rate": 1.944716921271489e-07, + "loss": 0.0019, + "step": 1425 + }, + { + "epoch": 8.748466257668712, + "grad_norm": 0.7839532494544983, + "learning_rate": 1.9261276877071354e-07, + "loss": 0.0055, + "step": 1426 + }, + { + "epoch": 8.754601226993865, + "grad_norm": 1.1153522729873657, + "learning_rate": 1.9076241667857988e-07, + "loss": 0.0048, + "step": 1427 + }, + { + "epoch": 8.76073619631902, + "grad_norm": 1.4735853672027588, + "learning_rate": 1.8892064272426042e-07, + "loss": 0.0079, + "step": 1428 + }, + { + "epoch": 8.766871165644172, + "grad_norm": 0.9770727157592773, + "learning_rate": 1.8708745374940469e-07, + "loss": 0.0013, + "step": 1429 + }, + { + "epoch": 8.773006134969325, + "grad_norm": 1.5710560083389282, + "learning_rate": 1.8526285656376873e-07, + "loss": 0.0046, + "step": 1430 + }, + { + "epoch": 8.779141104294478, + "grad_norm": 0.9026464819908142, + "learning_rate": 1.8344685794519507e-07, + "loss": 0.006, + "step": 1431 + }, + { + "epoch": 8.785276073619633, + "grad_norm": 1.2195831537246704, + "learning_rate": 1.8163946463958276e-07, + "loss": 0.0094, + "step": 1432 + }, + { + "epoch": 8.791411042944786, + "grad_norm": 0.31636637449264526, + "learning_rate": 1.7984068336086652e-07, + "loss": 0.0009, + "step": 1433 + }, + { + "epoch": 8.797546012269938, + "grad_norm": 0.5591960549354553, + "learning_rate": 1.780505207909894e-07, + "loss": 0.0014, + "step": 1434 + }, + { + "epoch": 8.803680981595091, + "grad_norm": 0.5905728340148926, + "learning_rate": 1.7626898357987782e-07, + "loss": 0.0013, + "step": 1435 + }, + { + "epoch": 8.809815950920246, + "grad_norm": 1.0983483791351318, + "learning_rate": 1.744960783454186e-07, + "loss": 0.0024, + "step": 1436 + }, + { + "epoch": 8.815950920245399, + "grad_norm": 0.7398350238800049, + "learning_rate": 1.727318116734328e-07, + "loss": 0.0015, + "step": 1437 + }, + { + "epoch": 8.822085889570552, + "grad_norm": 0.4621620774269104, + "learning_rate": 1.7097619011765127e-07, + "loss": 0.0017, + "step": 1438 + }, + { + "epoch": 8.828220858895705, + "grad_norm": 0.8077200055122375, + "learning_rate": 1.6922922019969145e-07, + "loss": 0.0009, + "step": 1439 + }, + { + "epoch": 8.83435582822086, + "grad_norm": 0.7134829163551331, + "learning_rate": 1.6749090840903233e-07, + "loss": 0.0013, + "step": 1440 + }, + { + "epoch": 8.840490797546012, + "grad_norm": 1.2837457656860352, + "learning_rate": 1.6576126120299046e-07, + "loss": 0.0029, + "step": 1441 + }, + { + "epoch": 8.846625766871165, + "grad_norm": 0.8713163137435913, + "learning_rate": 1.6404028500669633e-07, + "loss": 0.0034, + "step": 1442 + }, + { + "epoch": 8.85276073619632, + "grad_norm": 0.5622571706771851, + "learning_rate": 1.6232798621306918e-07, + "loss": 0.0022, + "step": 1443 + }, + { + "epoch": 8.858895705521473, + "grad_norm": 2.460902214050293, + "learning_rate": 1.606243711827951e-07, + "loss": 0.0329, + "step": 1444 + }, + { + "epoch": 8.865030674846626, + "grad_norm": 1.5952033996582031, + "learning_rate": 1.5892944624430334e-07, + "loss": 0.0092, + "step": 1445 + }, + { + "epoch": 8.871165644171779, + "grad_norm": 0.16087445616722107, + "learning_rate": 1.5724321769374023e-07, + "loss": 0.0005, + "step": 1446 + }, + { + "epoch": 8.877300613496933, + "grad_norm": 0.33085283637046814, + "learning_rate": 1.5556569179494857e-07, + "loss": 0.0005, + "step": 1447 + }, + { + "epoch": 8.883435582822086, + "grad_norm": 0.15866753458976746, + "learning_rate": 1.538968747794431e-07, + "loss": 0.0004, + "step": 1448 + }, + { + "epoch": 8.889570552147239, + "grad_norm": 1.0744353532791138, + "learning_rate": 1.5223677284638805e-07, + "loss": 0.0046, + "step": 1449 + }, + { + "epoch": 8.895705521472392, + "grad_norm": 0.8372928500175476, + "learning_rate": 1.5058539216257356e-07, + "loss": 0.0048, + "step": 1450 + }, + { + "epoch": 8.901840490797547, + "grad_norm": 1.0015332698822021, + "learning_rate": 1.4894273886239208e-07, + "loss": 0.0027, + "step": 1451 + }, + { + "epoch": 8.9079754601227, + "grad_norm": 1.1478570699691772, + "learning_rate": 1.473088190478178e-07, + "loss": 0.0134, + "step": 1452 + }, + { + "epoch": 8.914110429447852, + "grad_norm": 0.8685131669044495, + "learning_rate": 1.4568363878838087e-07, + "loss": 0.0024, + "step": 1453 + }, + { + "epoch": 8.920245398773005, + "grad_norm": 0.46051493287086487, + "learning_rate": 1.4406720412114828e-07, + "loss": 0.0019, + "step": 1454 + }, + { + "epoch": 8.92638036809816, + "grad_norm": 0.75945645570755, + "learning_rate": 1.4245952105069905e-07, + "loss": 0.0015, + "step": 1455 + }, + { + "epoch": 8.932515337423313, + "grad_norm": 1.2880934476852417, + "learning_rate": 1.4086059554910186e-07, + "loss": 0.0045, + "step": 1456 + }, + { + "epoch": 8.938650306748466, + "grad_norm": 0.2242523580789566, + "learning_rate": 1.3927043355589476e-07, + "loss": 0.0011, + "step": 1457 + }, + { + "epoch": 8.94478527607362, + "grad_norm": 1.0341970920562744, + "learning_rate": 1.3768904097806153e-07, + "loss": 0.0019, + "step": 1458 + }, + { + "epoch": 8.950920245398773, + "grad_norm": 0.8955618739128113, + "learning_rate": 1.361164236900092e-07, + "loss": 0.0027, + "step": 1459 + }, + { + "epoch": 8.957055214723926, + "grad_norm": 1.3581833839416504, + "learning_rate": 1.3455258753354932e-07, + "loss": 0.0048, + "step": 1460 + }, + { + "epoch": 8.963190184049079, + "grad_norm": 1.5094419717788696, + "learning_rate": 1.3299753831787193e-07, + "loss": 0.0011, + "step": 1461 + }, + { + "epoch": 8.969325153374234, + "grad_norm": 0.5978104472160339, + "learning_rate": 1.3145128181952737e-07, + "loss": 0.0018, + "step": 1462 + }, + { + "epoch": 8.975460122699387, + "grad_norm": 0.7072922587394714, + "learning_rate": 1.2991382378240325e-07, + "loss": 0.0032, + "step": 1463 + }, + { + "epoch": 8.98159509202454, + "grad_norm": 0.5541467666625977, + "learning_rate": 1.2838516991770355e-07, + "loss": 0.001, + "step": 1464 + }, + { + "epoch": 8.987730061349692, + "grad_norm": 0.6946907043457031, + "learning_rate": 1.2686532590392763e-07, + "loss": 0.0024, + "step": 1465 + }, + { + "epoch": 8.993865030674847, + "grad_norm": 0.3228455185890198, + "learning_rate": 1.2535429738684822e-07, + "loss": 0.0007, + "step": 1466 + }, + { + "epoch": 9.0, + "grad_norm": 2.4403252601623535, + "learning_rate": 1.238520899794915e-07, + "loss": 0.0245, + "step": 1467 + } + ], + "logging_steps": 1, + "max_steps": 1630, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.632019917168968e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6d1378e534430bd764af1f01625cf0b940470592 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json @@ -0,0 +1,1175 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 163, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 5.908512115478516, + "learning_rate": 5e-06, + "loss": 0.9606, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 4.304474353790283, + "learning_rate": 4.999995356617983e-06, + "loss": 0.8609, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 5.63697624206543, + "learning_rate": 4.999981426489179e-06, + "loss": 1.3543, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 3.6674246788024902, + "learning_rate": 4.999958209665336e-06, + "loss": 0.787, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 48.14854431152344, + "learning_rate": 4.999925706232695e-06, + "loss": 1.7786, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 7.8689866065979, + "learning_rate": 4.999883916312e-06, + "loss": 1.2175, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 5.119968891143799, + "learning_rate": 4.9998328400584864e-06, + "loss": 0.8998, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 3.730757713317871, + "learning_rate": 4.999772477661888e-06, + "loss": 0.8419, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 27.314565658569336, + "learning_rate": 4.999702829346432e-06, + "loss": 1.7948, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.822697162628174, + "learning_rate": 4.999623895370843e-06, + "loss": 1.0461, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 4.71220588684082, + "learning_rate": 4.999535676028338e-06, + "loss": 1.0, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.2378087043762207, + "learning_rate": 4.999438171646624e-06, + "loss": 0.9475, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 3.475543737411499, + "learning_rate": 4.999331382587901e-06, + "loss": 0.8654, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 10.06365966796875, + "learning_rate": 4.999215309248861e-06, + "loss": 1.2042, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 3.785153865814209, + "learning_rate": 4.999089952060681e-06, + "loss": 0.8846, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 2.944488048553467, + "learning_rate": 4.998955311489025e-06, + "loss": 0.8805, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 39.89304733276367, + "learning_rate": 4.998811388034046e-06, + "loss": 1.5882, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 3.5883963108062744, + "learning_rate": 4.9986581822303746e-06, + "loss": 0.9222, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 6.972247123718262, + "learning_rate": 4.998495694647127e-06, + "loss": 1.4088, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 3.948991298675537, + "learning_rate": 4.998323925887895e-06, + "loss": 1.454, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 3.8690035343170166, + "learning_rate": 4.998142876590749e-06, + "loss": 0.6335, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 5.243765830993652, + "learning_rate": 4.997952547428236e-06, + "loss": 0.6725, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 3.5994043350219727, + "learning_rate": 4.997752939107372e-06, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 4.06965970993042, + "learning_rate": 4.997544052369642e-06, + "loss": 0.9683, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 3.3247246742248535, + "learning_rate": 4.997325887990999e-06, + "loss": 0.9414, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 5.811742782592773, + "learning_rate": 4.997098446781861e-06, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 2.661334753036499, + "learning_rate": 4.996861729587103e-06, + "loss": 0.7708, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.863943576812744, + "learning_rate": 4.996615737286061e-06, + "loss": 0.6995, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 20.376733779907227, + "learning_rate": 4.996360470792524e-06, + "loss": 1.2563, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 3.62265682220459, + "learning_rate": 4.996095931054731e-06, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 3.915076732635498, + "learning_rate": 4.9958221190553705e-06, + "loss": 0.9227, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 3.129855155944824, + "learning_rate": 4.995539035811572e-06, + "loss": 0.701, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 2.7532224655151367, + "learning_rate": 4.9952466823749076e-06, + "loss": 0.6491, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 2.8444128036499023, + "learning_rate": 4.9949450598313835e-06, + "loss": 0.8029, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.57743239402771, + "learning_rate": 4.994634169301439e-06, + "loss": 0.8785, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 3.280055284500122, + "learning_rate": 4.994314011939941e-06, + "loss": 1.034, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 2.455838680267334, + "learning_rate": 4.99398458893618e-06, + "loss": 0.8557, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 4.72681188583374, + "learning_rate": 4.993645901513865e-06, + "loss": 1.1904, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 3.0585641860961914, + "learning_rate": 4.993297950931121e-06, + "loss": 0.7668, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.4603540897369385, + "learning_rate": 4.9929407384804806e-06, + "loss": 0.8812, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.9702436923980713, + "learning_rate": 4.992574265488883e-06, + "loss": 0.8878, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 2.6973602771759033, + "learning_rate": 4.9921985333176694e-06, + "loss": 0.7251, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 2.5542335510253906, + "learning_rate": 4.991813543362572e-06, + "loss": 0.6638, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.7530782222747803, + "learning_rate": 4.991419297053716e-06, + "loss": 1.0725, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 2.6483025550842285, + "learning_rate": 4.991015795855611e-06, + "loss": 0.7238, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 3.434422492980957, + "learning_rate": 4.990603041267144e-06, + "loss": 0.9188, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.914340019226074, + "learning_rate": 4.990181034821578e-06, + "loss": 0.6158, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 2.7211625576019287, + "learning_rate": 4.98974977808654e-06, + "loss": 0.7165, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.8414249420166016, + "learning_rate": 4.989309272664026e-06, + "loss": 0.7277, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 3.683204412460327, + "learning_rate": 4.988859520190381e-06, + "loss": 0.9793, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 3.1732583045959473, + "learning_rate": 4.988400522336304e-06, + "loss": 0.8966, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 2.7789194583892822, + "learning_rate": 4.9879322808068365e-06, + "loss": 0.8191, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.754816770553589, + "learning_rate": 4.987454797341358e-06, + "loss": 0.6308, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.730104684829712, + "learning_rate": 4.98696807371358e-06, + "loss": 0.8226, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 3.2225449085235596, + "learning_rate": 4.986472111731536e-06, + "loss": 0.9184, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 3.2684760093688965, + "learning_rate": 4.985966913237581e-06, + "loss": 0.6593, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.43105411529541, + "learning_rate": 4.985452480108376e-06, + "loss": 0.6994, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 7.366360664367676, + "learning_rate": 4.984928814254889e-06, + "loss": 1.1374, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.81864333152771, + "learning_rate": 4.984395917622387e-06, + "loss": 0.8097, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 3.1107730865478516, + "learning_rate": 4.9838537921904206e-06, + "loss": 0.8511, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 2.460545301437378, + "learning_rate": 4.9833024399728295e-06, + "loss": 0.898, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.921992778778076, + "learning_rate": 4.982741863017722e-06, + "loss": 0.6671, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 3.3006443977355957, + "learning_rate": 4.982172063407479e-06, + "loss": 1.0559, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.642587661743164, + "learning_rate": 4.9815930432587365e-06, + "loss": 0.6663, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.905898094177246, + "learning_rate": 4.981004804722384e-06, + "loss": 0.6895, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.9174182415008545, + "learning_rate": 4.980407349983556e-06, + "loss": 0.7982, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 2.214322805404663, + "learning_rate": 4.979800681261619e-06, + "loss": 0.6808, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 2.7152462005615234, + "learning_rate": 4.9791848008101705e-06, + "loss": 0.567, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.5657734870910645, + "learning_rate": 4.978559710917024e-06, + "loss": 0.7745, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 3.9103832244873047, + "learning_rate": 4.977925413904205e-06, + "loss": 0.9815, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 4.610236644744873, + "learning_rate": 4.9772819121279395e-06, + "loss": 1.164, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 3.01170015335083, + "learning_rate": 4.976629207978648e-06, + "loss": 0.7587, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 3.175889253616333, + "learning_rate": 4.975967303880933e-06, + "loss": 0.58, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 2.503741502761841, + "learning_rate": 4.975296202293575e-06, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 2.6778078079223633, + "learning_rate": 4.974615905709518e-06, + "loss": 0.7352, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 5.950812816619873, + "learning_rate": 4.973926416655863e-06, + "loss": 1.0643, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 3.0165305137634277, + "learning_rate": 4.973227737693858e-06, + "loss": 0.6699, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 4.793259620666504, + "learning_rate": 4.972519871418894e-06, + "loss": 1.0315, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 3.632815361022949, + "learning_rate": 4.971802820460481e-06, + "loss": 0.7003, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 3.077507734298706, + "learning_rate": 4.971076587482254e-06, + "loss": 0.6776, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 3.3886241912841797, + "learning_rate": 4.970341175181957e-06, + "loss": 0.7422, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 2.71288800239563, + "learning_rate": 4.969596586291425e-06, + "loss": 0.7471, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.777920961380005, + "learning_rate": 4.968842823576592e-06, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 6.496985912322998, + "learning_rate": 4.968079889837461e-06, + "loss": 0.9965, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.6163430213928223, + "learning_rate": 4.967307787908108e-06, + "loss": 0.6833, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 3.244098663330078, + "learning_rate": 4.966526520656663e-06, + "loss": 0.8373, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.9027860164642334, + "learning_rate": 4.965736090985305e-06, + "loss": 0.8529, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 2.3786230087280273, + "learning_rate": 4.964936501830246e-06, + "loss": 0.6577, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 7.3099045753479, + "learning_rate": 4.964127756161727e-06, + "loss": 1.1184, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 3.068873167037964, + "learning_rate": 4.963309856983998e-06, + "loss": 0.7906, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 3.082547426223755, + "learning_rate": 4.9624828073353144e-06, + "loss": 0.8107, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.4586973190307617, + "learning_rate": 4.961646610287922e-06, + "loss": 0.7421, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 2.779277801513672, + "learning_rate": 4.960801268948047e-06, + "loss": 0.7134, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 3.2255213260650635, + "learning_rate": 4.959946786455882e-06, + "loss": 0.5875, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 2.783395528793335, + "learning_rate": 4.959083165985581e-06, + "loss": 0.6595, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 2.240114212036133, + "learning_rate": 4.958210410745237e-06, + "loss": 0.793, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.9399421215057373, + "learning_rate": 4.957328523976879e-06, + "loss": 0.5896, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 3.4449355602264404, + "learning_rate": 4.956437508956458e-06, + "loss": 0.8658, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 4.273710250854492, + "learning_rate": 4.9555373689938325e-06, + "loss": 0.8316, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.4222047328948975, + "learning_rate": 4.954628107432757e-06, + "loss": 1.0613, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 2.5318963527679443, + "learning_rate": 4.95370972765087e-06, + "loss": 0.7194, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 2.7852585315704346, + "learning_rate": 4.952782233059683e-06, + "loss": 0.5927, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 2.6532323360443115, + "learning_rate": 4.951845627104565e-06, + "loss": 0.8505, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 2.3213467597961426, + "learning_rate": 4.95089991326473e-06, + "loss": 0.8682, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 2.607992649078369, + "learning_rate": 4.9499450950532305e-06, + "loss": 0.8735, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 3.9820072650909424, + "learning_rate": 4.94898117601693e-06, + "loss": 1.0571, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 3.3878824710845947, + "learning_rate": 4.948008159736507e-06, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 2.6935670375823975, + "learning_rate": 4.94702604982643e-06, + "loss": 0.5968, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.78190016746521, + "learning_rate": 4.9460348499349485e-06, + "loss": 0.7504, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 2.973083972930908, + "learning_rate": 4.945034563744077e-06, + "loss": 0.6728, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 2.631803512573242, + "learning_rate": 4.944025194969586e-06, + "loss": 0.609, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 2.7443883419036865, + "learning_rate": 4.9430067473609825e-06, + "loss": 0.8713, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 2.543769121170044, + "learning_rate": 4.941979224701499e-06, + "loss": 0.8035, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 3.7799901962280273, + "learning_rate": 4.94094263080808e-06, + "loss": 0.9341, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 3.1234734058380127, + "learning_rate": 4.939896969531367e-06, + "loss": 1.1066, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 2.356036424636841, + "learning_rate": 4.938842244755683e-06, + "loss": 0.853, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 3.6231274604797363, + "learning_rate": 4.937778460399022e-06, + "loss": 0.9116, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 3.1277005672454834, + "learning_rate": 4.936705620413028e-06, + "loss": 0.5888, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.7338361740112305, + "learning_rate": 4.935623728782986e-06, + "loss": 0.592, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 2.748363733291626, + "learning_rate": 4.934532789527805e-06, + "loss": 0.8713, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 4.460031986236572, + "learning_rate": 4.933432806700004e-06, + "loss": 0.6791, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 2.392911911010742, + "learning_rate": 4.932323784385693e-06, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.7804384231567383, + "learning_rate": 4.931205726704566e-06, + "loss": 0.7547, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.7664780616760254, + "learning_rate": 4.930078637809878e-06, + "loss": 0.7849, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 2.592808723449707, + "learning_rate": 4.928942521888431e-06, + "loss": 0.7015, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 2.7080585956573486, + "learning_rate": 4.927797383160561e-06, + "loss": 1.0028, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.7941503524780273, + "learning_rate": 4.926643225880123e-06, + "loss": 0.602, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 3.2796623706817627, + "learning_rate": 4.925480054334471e-06, + "loss": 0.7473, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 2.7623610496520996, + "learning_rate": 4.924307872844444e-06, + "loss": 1.0573, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.6224453449249268, + "learning_rate": 4.923126685764351e-06, + "loss": 0.7399, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 17.736326217651367, + "learning_rate": 4.921936497481956e-06, + "loss": 0.9548, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.504213333129883, + "learning_rate": 4.920737312418456e-06, + "loss": 0.6748, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 3.617077350616455, + "learning_rate": 4.919529135028473e-06, + "loss": 0.8431, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.6559832096099854, + "learning_rate": 4.918311969800027e-06, + "loss": 0.7243, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 2.7539305686950684, + "learning_rate": 4.917085821254532e-06, + "loss": 0.7845, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 3.3587615489959717, + "learning_rate": 4.915850693946766e-06, + "loss": 0.4891, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 3.064354181289673, + "learning_rate": 4.914606592464865e-06, + "loss": 0.7917, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 3.2505199909210205, + "learning_rate": 4.9133535214303e-06, + "loss": 0.9681, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 3.8027830123901367, + "learning_rate": 4.91209148549786e-06, + "loss": 0.9275, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 2.4154372215270996, + "learning_rate": 4.910820489355637e-06, + "loss": 0.7259, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.892462968826294, + "learning_rate": 4.909540537725007e-06, + "loss": 0.6061, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 3.3398196697235107, + "learning_rate": 4.908251635360616e-06, + "loss": 1.0559, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 3.022512197494507, + "learning_rate": 4.906953787050354e-06, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 2.658661365509033, + "learning_rate": 4.905646997615347e-06, + "loss": 0.6234, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 3.454400062561035, + "learning_rate": 4.904331271909932e-06, + "loss": 0.8066, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 3.1300277709960938, + "learning_rate": 4.903006614821645e-06, + "loss": 0.6861, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 2.362537145614624, + "learning_rate": 4.901673031271194e-06, + "loss": 0.6112, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 3.375577688217163, + "learning_rate": 4.900330526212451e-06, + "loss": 0.6314, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 2.955656051635742, + "learning_rate": 4.898979104632427e-06, + "loss": 0.889, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 2.9285926818847656, + "learning_rate": 4.897618771551255e-06, + "loss": 0.6406, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.131819725036621, + "learning_rate": 4.8962495320221714e-06, + "loss": 0.6368, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 2.780649185180664, + "learning_rate": 4.8948713911315e-06, + "loss": 0.8642, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 2.941500186920166, + "learning_rate": 4.8934843539986266e-06, + "loss": 0.714, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.892088425775986e-06, + "loss": 0.8365, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 2.6887171268463135, + "learning_rate": 4.890683611649041e-06, + "loss": 0.7937, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 3.7638463973999023, + "learning_rate": 4.8892699168362626e-06, + "loss": 0.7485, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.8132755756378174, + "learning_rate": 4.887847346589111e-06, + "loss": 0.6467, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.652247190475464, + "learning_rate": 4.886415906192015e-06, + "loss": 0.4651, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 2.5854647159576416, + "learning_rate": 4.884975600962355e-06, + "loss": 0.8756, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 3.1630544662475586, + "learning_rate": 4.883526436250441e-06, + "loss": 0.7339, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.84452748298645, + "learning_rate": 4.8820684174394935e-06, + "loss": 0.7808, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 3.604048490524292, + "learning_rate": 4.880601549945622e-06, + "loss": 0.96, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 2.302924871444702, + "learning_rate": 4.879125839217808e-06, + "loss": 0.8122, + "step": 163 + } + ], + "logging_steps": 1, + "max_steps": 1630, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.029784817192141e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..416b883af00c613c7b5bd2aee5c64ef495b9d29a --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json @@ -0,0 +1,2316 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 326, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 5.908512115478516, + "learning_rate": 5e-06, + "loss": 0.9606, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 4.304474353790283, + "learning_rate": 4.999995356617983e-06, + "loss": 0.8609, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 5.63697624206543, + "learning_rate": 4.999981426489179e-06, + "loss": 1.3543, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 3.6674246788024902, + "learning_rate": 4.999958209665336e-06, + "loss": 0.787, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 48.14854431152344, + "learning_rate": 4.999925706232695e-06, + "loss": 1.7786, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 7.8689866065979, + "learning_rate": 4.999883916312e-06, + "loss": 1.2175, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 5.119968891143799, + "learning_rate": 4.9998328400584864e-06, + "loss": 0.8998, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 3.730757713317871, + "learning_rate": 4.999772477661888e-06, + "loss": 0.8419, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 27.314565658569336, + "learning_rate": 4.999702829346432e-06, + "loss": 1.7948, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.822697162628174, + "learning_rate": 4.999623895370843e-06, + "loss": 1.0461, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 4.71220588684082, + "learning_rate": 4.999535676028338e-06, + "loss": 1.0, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.2378087043762207, + "learning_rate": 4.999438171646624e-06, + "loss": 0.9475, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 3.475543737411499, + "learning_rate": 4.999331382587901e-06, + "loss": 0.8654, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 10.06365966796875, + "learning_rate": 4.999215309248861e-06, + "loss": 1.2042, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 3.785153865814209, + "learning_rate": 4.999089952060681e-06, + "loss": 0.8846, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 2.944488048553467, + "learning_rate": 4.998955311489025e-06, + "loss": 0.8805, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 39.89304733276367, + "learning_rate": 4.998811388034046e-06, + "loss": 1.5882, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 3.5883963108062744, + "learning_rate": 4.9986581822303746e-06, + "loss": 0.9222, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 6.972247123718262, + "learning_rate": 4.998495694647127e-06, + "loss": 1.4088, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 3.948991298675537, + "learning_rate": 4.998323925887895e-06, + "loss": 1.454, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 3.8690035343170166, + "learning_rate": 4.998142876590749e-06, + "loss": 0.6335, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 5.243765830993652, + "learning_rate": 4.997952547428236e-06, + "loss": 0.6725, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 3.5994043350219727, + "learning_rate": 4.997752939107372e-06, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 4.06965970993042, + "learning_rate": 4.997544052369642e-06, + "loss": 0.9683, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 3.3247246742248535, + "learning_rate": 4.997325887990999e-06, + "loss": 0.9414, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 5.811742782592773, + "learning_rate": 4.997098446781861e-06, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 2.661334753036499, + "learning_rate": 4.996861729587103e-06, + "loss": 0.7708, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.863943576812744, + "learning_rate": 4.996615737286061e-06, + "loss": 0.6995, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 20.376733779907227, + "learning_rate": 4.996360470792524e-06, + "loss": 1.2563, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 3.62265682220459, + "learning_rate": 4.996095931054731e-06, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 3.915076732635498, + "learning_rate": 4.9958221190553705e-06, + "loss": 0.9227, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 3.129855155944824, + "learning_rate": 4.995539035811572e-06, + "loss": 0.701, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 2.7532224655151367, + "learning_rate": 4.9952466823749076e-06, + "loss": 0.6491, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 2.8444128036499023, + "learning_rate": 4.9949450598313835e-06, + "loss": 0.8029, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.57743239402771, + "learning_rate": 4.994634169301439e-06, + "loss": 0.8785, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 3.280055284500122, + "learning_rate": 4.994314011939941e-06, + "loss": 1.034, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 2.455838680267334, + "learning_rate": 4.99398458893618e-06, + "loss": 0.8557, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 4.72681188583374, + "learning_rate": 4.993645901513865e-06, + "loss": 1.1904, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 3.0585641860961914, + "learning_rate": 4.993297950931121e-06, + "loss": 0.7668, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.4603540897369385, + "learning_rate": 4.9929407384804806e-06, + "loss": 0.8812, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.9702436923980713, + "learning_rate": 4.992574265488883e-06, + "loss": 0.8878, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 2.6973602771759033, + "learning_rate": 4.9921985333176694e-06, + "loss": 0.7251, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 2.5542335510253906, + "learning_rate": 4.991813543362572e-06, + "loss": 0.6638, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.7530782222747803, + "learning_rate": 4.991419297053716e-06, + "loss": 1.0725, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 2.6483025550842285, + "learning_rate": 4.991015795855611e-06, + "loss": 0.7238, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 3.434422492980957, + "learning_rate": 4.990603041267144e-06, + "loss": 0.9188, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.914340019226074, + "learning_rate": 4.990181034821578e-06, + "loss": 0.6158, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 2.7211625576019287, + "learning_rate": 4.98974977808654e-06, + "loss": 0.7165, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.8414249420166016, + "learning_rate": 4.989309272664026e-06, + "loss": 0.7277, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 3.683204412460327, + "learning_rate": 4.988859520190381e-06, + "loss": 0.9793, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 3.1732583045959473, + "learning_rate": 4.988400522336304e-06, + "loss": 0.8966, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 2.7789194583892822, + "learning_rate": 4.9879322808068365e-06, + "loss": 0.8191, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.754816770553589, + "learning_rate": 4.987454797341358e-06, + "loss": 0.6308, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.730104684829712, + "learning_rate": 4.98696807371358e-06, + "loss": 0.8226, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 3.2225449085235596, + "learning_rate": 4.986472111731536e-06, + "loss": 0.9184, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 3.2684760093688965, + "learning_rate": 4.985966913237581e-06, + "loss": 0.6593, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.43105411529541, + "learning_rate": 4.985452480108376e-06, + "loss": 0.6994, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 7.366360664367676, + "learning_rate": 4.984928814254889e-06, + "loss": 1.1374, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.81864333152771, + "learning_rate": 4.984395917622387e-06, + "loss": 0.8097, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 3.1107730865478516, + "learning_rate": 4.9838537921904206e-06, + "loss": 0.8511, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 2.460545301437378, + "learning_rate": 4.9833024399728295e-06, + "loss": 0.898, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.921992778778076, + "learning_rate": 4.982741863017722e-06, + "loss": 0.6671, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 3.3006443977355957, + "learning_rate": 4.982172063407479e-06, + "loss": 1.0559, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.642587661743164, + "learning_rate": 4.9815930432587365e-06, + "loss": 0.6663, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.905898094177246, + "learning_rate": 4.981004804722384e-06, + "loss": 0.6895, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.9174182415008545, + "learning_rate": 4.980407349983556e-06, + "loss": 0.7982, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 2.214322805404663, + "learning_rate": 4.979800681261619e-06, + "loss": 0.6808, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 2.7152462005615234, + "learning_rate": 4.9791848008101705e-06, + "loss": 0.567, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.5657734870910645, + "learning_rate": 4.978559710917024e-06, + "loss": 0.7745, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 3.9103832244873047, + "learning_rate": 4.977925413904205e-06, + "loss": 0.9815, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 4.610236644744873, + "learning_rate": 4.9772819121279395e-06, + "loss": 1.164, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 3.01170015335083, + "learning_rate": 4.976629207978648e-06, + "loss": 0.7587, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 3.175889253616333, + "learning_rate": 4.975967303880933e-06, + "loss": 0.58, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 2.503741502761841, + "learning_rate": 4.975296202293575e-06, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 2.6778078079223633, + "learning_rate": 4.974615905709518e-06, + "loss": 0.7352, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 5.950812816619873, + "learning_rate": 4.973926416655863e-06, + "loss": 1.0643, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 3.0165305137634277, + "learning_rate": 4.973227737693858e-06, + "loss": 0.6699, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 4.793259620666504, + "learning_rate": 4.972519871418894e-06, + "loss": 1.0315, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 3.632815361022949, + "learning_rate": 4.971802820460481e-06, + "loss": 0.7003, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 3.077507734298706, + "learning_rate": 4.971076587482254e-06, + "loss": 0.6776, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 3.3886241912841797, + "learning_rate": 4.970341175181957e-06, + "loss": 0.7422, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 2.71288800239563, + "learning_rate": 4.969596586291425e-06, + "loss": 0.7471, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.777920961380005, + "learning_rate": 4.968842823576592e-06, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 6.496985912322998, + "learning_rate": 4.968079889837461e-06, + "loss": 0.9965, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.6163430213928223, + "learning_rate": 4.967307787908108e-06, + "loss": 0.6833, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 3.244098663330078, + "learning_rate": 4.966526520656663e-06, + "loss": 0.8373, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.9027860164642334, + "learning_rate": 4.965736090985305e-06, + "loss": 0.8529, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 2.3786230087280273, + "learning_rate": 4.964936501830246e-06, + "loss": 0.6577, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 7.3099045753479, + "learning_rate": 4.964127756161727e-06, + "loss": 1.1184, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 3.068873167037964, + "learning_rate": 4.963309856983998e-06, + "loss": 0.7906, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 3.082547426223755, + "learning_rate": 4.9624828073353144e-06, + "loss": 0.8107, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.4586973190307617, + "learning_rate": 4.961646610287922e-06, + "loss": 0.7421, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 2.779277801513672, + "learning_rate": 4.960801268948047e-06, + "loss": 0.7134, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 3.2255213260650635, + "learning_rate": 4.959946786455882e-06, + "loss": 0.5875, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 2.783395528793335, + "learning_rate": 4.959083165985581e-06, + "loss": 0.6595, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 2.240114212036133, + "learning_rate": 4.958210410745237e-06, + "loss": 0.793, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.9399421215057373, + "learning_rate": 4.957328523976879e-06, + "loss": 0.5896, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 3.4449355602264404, + "learning_rate": 4.956437508956458e-06, + "loss": 0.8658, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 4.273710250854492, + "learning_rate": 4.9555373689938325e-06, + "loss": 0.8316, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.4222047328948975, + "learning_rate": 4.954628107432757e-06, + "loss": 1.0613, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 2.5318963527679443, + "learning_rate": 4.95370972765087e-06, + "loss": 0.7194, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 2.7852585315704346, + "learning_rate": 4.952782233059683e-06, + "loss": 0.5927, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 2.6532323360443115, + "learning_rate": 4.951845627104565e-06, + "loss": 0.8505, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 2.3213467597961426, + "learning_rate": 4.95089991326473e-06, + "loss": 0.8682, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 2.607992649078369, + "learning_rate": 4.9499450950532305e-06, + "loss": 0.8735, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 3.9820072650909424, + "learning_rate": 4.94898117601693e-06, + "loss": 1.0571, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 3.3878824710845947, + "learning_rate": 4.948008159736507e-06, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 2.6935670375823975, + "learning_rate": 4.94702604982643e-06, + "loss": 0.5968, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.78190016746521, + "learning_rate": 4.9460348499349485e-06, + "loss": 0.7504, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 2.973083972930908, + "learning_rate": 4.945034563744077e-06, + "loss": 0.6728, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 2.631803512573242, + "learning_rate": 4.944025194969586e-06, + "loss": 0.609, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 2.7443883419036865, + "learning_rate": 4.9430067473609825e-06, + "loss": 0.8713, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 2.543769121170044, + "learning_rate": 4.941979224701499e-06, + "loss": 0.8035, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 3.7799901962280273, + "learning_rate": 4.94094263080808e-06, + "loss": 0.9341, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 3.1234734058380127, + "learning_rate": 4.939896969531367e-06, + "loss": 1.1066, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 2.356036424636841, + "learning_rate": 4.938842244755683e-06, + "loss": 0.853, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 3.6231274604797363, + "learning_rate": 4.937778460399022e-06, + "loss": 0.9116, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 3.1277005672454834, + "learning_rate": 4.936705620413028e-06, + "loss": 0.5888, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.7338361740112305, + "learning_rate": 4.935623728782986e-06, + "loss": 0.592, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 2.748363733291626, + "learning_rate": 4.934532789527805e-06, + "loss": 0.8713, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 4.460031986236572, + "learning_rate": 4.933432806700004e-06, + "loss": 0.6791, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 2.392911911010742, + "learning_rate": 4.932323784385693e-06, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.7804384231567383, + "learning_rate": 4.931205726704566e-06, + "loss": 0.7547, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.7664780616760254, + "learning_rate": 4.930078637809878e-06, + "loss": 0.7849, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 2.592808723449707, + "learning_rate": 4.928942521888431e-06, + "loss": 0.7015, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 2.7080585956573486, + "learning_rate": 4.927797383160561e-06, + "loss": 1.0028, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.7941503524780273, + "learning_rate": 4.926643225880123e-06, + "loss": 0.602, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 3.2796623706817627, + "learning_rate": 4.925480054334471e-06, + "loss": 0.7473, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 2.7623610496520996, + "learning_rate": 4.924307872844444e-06, + "loss": 1.0573, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.6224453449249268, + "learning_rate": 4.923126685764351e-06, + "loss": 0.7399, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 17.736326217651367, + "learning_rate": 4.921936497481956e-06, + "loss": 0.9548, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.504213333129883, + "learning_rate": 4.920737312418456e-06, + "loss": 0.6748, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 3.617077350616455, + "learning_rate": 4.919529135028473e-06, + "loss": 0.8431, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.6559832096099854, + "learning_rate": 4.918311969800027e-06, + "loss": 0.7243, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 2.7539305686950684, + "learning_rate": 4.917085821254532e-06, + "loss": 0.7845, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 3.3587615489959717, + "learning_rate": 4.915850693946766e-06, + "loss": 0.4891, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 3.064354181289673, + "learning_rate": 4.914606592464865e-06, + "loss": 0.7917, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 3.2505199909210205, + "learning_rate": 4.9133535214303e-06, + "loss": 0.9681, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 3.8027830123901367, + "learning_rate": 4.91209148549786e-06, + "loss": 0.9275, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 2.4154372215270996, + "learning_rate": 4.910820489355637e-06, + "loss": 0.7259, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.892462968826294, + "learning_rate": 4.909540537725007e-06, + "loss": 0.6061, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 3.3398196697235107, + "learning_rate": 4.908251635360616e-06, + "loss": 1.0559, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 3.022512197494507, + "learning_rate": 4.906953787050354e-06, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 2.658661365509033, + "learning_rate": 4.905646997615347e-06, + "loss": 0.6234, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 3.454400062561035, + "learning_rate": 4.904331271909932e-06, + "loss": 0.8066, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 3.1300277709960938, + "learning_rate": 4.903006614821645e-06, + "loss": 0.6861, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 2.362537145614624, + "learning_rate": 4.901673031271194e-06, + "loss": 0.6112, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 3.375577688217163, + "learning_rate": 4.900330526212451e-06, + "loss": 0.6314, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 2.955656051635742, + "learning_rate": 4.898979104632427e-06, + "loss": 0.889, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 2.9285926818847656, + "learning_rate": 4.897618771551255e-06, + "loss": 0.6406, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.131819725036621, + "learning_rate": 4.8962495320221714e-06, + "loss": 0.6368, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 2.780649185180664, + "learning_rate": 4.8948713911315e-06, + "loss": 0.8642, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 2.941500186920166, + "learning_rate": 4.8934843539986266e-06, + "loss": 0.714, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.892088425775986e-06, + "loss": 0.8365, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 2.6887171268463135, + "learning_rate": 4.890683611649041e-06, + "loss": 0.7937, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 3.7638463973999023, + "learning_rate": 4.8892699168362626e-06, + "loss": 0.7485, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.8132755756378174, + "learning_rate": 4.887847346589111e-06, + "loss": 0.6467, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.652247190475464, + "learning_rate": 4.886415906192015e-06, + "loss": 0.4651, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 2.5854647159576416, + "learning_rate": 4.884975600962355e-06, + "loss": 0.8756, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 3.1630544662475586, + "learning_rate": 4.883526436250441e-06, + "loss": 0.7339, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.84452748298645, + "learning_rate": 4.8820684174394935e-06, + "loss": 0.7808, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 3.604048490524292, + "learning_rate": 4.880601549945622e-06, + "loss": 0.96, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 2.302924871444702, + "learning_rate": 4.879125839217808e-06, + "loss": 0.8122, + "step": 163 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 3.1254405975341797, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.7307, + "step": 164 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 2.745603322982788, + "learning_rate": 4.8761479100205085e-06, + "loss": 0.7554, + "step": 165 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 2.494840145111084, + "learning_rate": 4.874645702613152e-06, + "loss": 0.4372, + "step": 166 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 2.3526735305786133, + "learning_rate": 4.873134674096072e-06, + "loss": 0.3597, + "step": 167 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 2.945887804031372, + "learning_rate": 4.871614830082297e-06, + "loss": 0.5854, + "step": 168 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 3.5723934173583984, + "learning_rate": 4.870086176217597e-06, + "loss": 0.7978, + "step": 169 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 3.2997145652770996, + "learning_rate": 4.868548718180473e-06, + "loss": 0.5593, + "step": 170 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 3.4120635986328125, + "learning_rate": 4.867002461682129e-06, + "loss": 0.4083, + "step": 171 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 2.697617292404175, + "learning_rate": 4.8654474124664505e-06, + "loss": 0.4752, + "step": 172 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 5.082247734069824, + "learning_rate": 4.863883576309991e-06, + "loss": 0.7435, + "step": 173 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 2.773864984512329, + "learning_rate": 4.8623109590219395e-06, + "loss": 0.4612, + "step": 174 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 3.429703712463379, + "learning_rate": 4.860729566444106e-06, + "loss": 0.4644, + "step": 175 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 2.997938394546509, + "learning_rate": 4.8591394044508985e-06, + "loss": 0.4852, + "step": 176 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 2.549513339996338, + "learning_rate": 4.857540478949302e-06, + "loss": 0.4574, + "step": 177 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 3.459400177001953, + "learning_rate": 4.855932795878852e-06, + "loss": 0.8095, + "step": 178 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 2.8103644847869873, + "learning_rate": 4.854316361211619e-06, + "loss": 0.4578, + "step": 179 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 2.631221055984497, + "learning_rate": 4.852691180952183e-06, + "loss": 0.5473, + "step": 180 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 3.189946174621582, + "learning_rate": 4.851057261137608e-06, + "loss": 0.4313, + "step": 181 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 2.891418933868408, + "learning_rate": 4.8494146078374274e-06, + "loss": 0.4197, + "step": 182 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 3.239637613296509, + "learning_rate": 4.847763227153612e-06, + "loss": 0.5865, + "step": 183 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 2.484644651412964, + "learning_rate": 4.846103125220557e-06, + "loss": 0.3866, + "step": 184 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 3.1045992374420166, + "learning_rate": 4.844434308205052e-06, + "loss": 0.5357, + "step": 185 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 2.648472309112549, + "learning_rate": 4.842756782306261e-06, + "loss": 0.4783, + "step": 186 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 2.5685644149780273, + "learning_rate": 4.841070553755697e-06, + "loss": 0.3733, + "step": 187 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 3.7727200984954834, + "learning_rate": 4.839375628817205e-06, + "loss": 0.6039, + "step": 188 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 2.8237369060516357, + "learning_rate": 4.837672013786931e-06, + "loss": 0.5372, + "step": 189 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 3.0312252044677734, + "learning_rate": 4.835959714993305e-06, + "loss": 0.5162, + "step": 190 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 2.821498394012451, + "learning_rate": 4.8342387387970105e-06, + "loss": 0.4537, + "step": 191 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 2.7834129333496094, + "learning_rate": 4.832509091590968e-06, + "loss": 0.6165, + "step": 192 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 2.9274091720581055, + "learning_rate": 4.830770779800309e-06, + "loss": 0.7475, + "step": 193 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 2.813945770263672, + "learning_rate": 4.829023809882349e-06, + "loss": 0.4629, + "step": 194 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 2.27876877784729, + "learning_rate": 4.827268188326567e-06, + "loss": 0.5208, + "step": 195 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 2.8444204330444336, + "learning_rate": 4.825503921654582e-06, + "loss": 0.6521, + "step": 196 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 3.3730578422546387, + "learning_rate": 4.823731016420122e-06, + "loss": 0.7491, + "step": 197 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 2.9717822074890137, + "learning_rate": 4.821949479209011e-06, + "loss": 0.3866, + "step": 198 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 2.6570653915405273, + "learning_rate": 4.820159316639133e-06, + "loss": 0.499, + "step": 199 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 2.819960117340088, + "learning_rate": 4.818360535360418e-06, + "loss": 0.556, + "step": 200 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 2.7912111282348633, + "learning_rate": 4.816553142054806e-06, + "loss": 0.3433, + "step": 201 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 2.6427981853485107, + "learning_rate": 4.814737143436232e-06, + "loss": 0.8808, + "step": 202 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 2.5917580127716064, + "learning_rate": 4.812912546250595e-06, + "loss": 0.5718, + "step": 203 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 3.770759344100952, + "learning_rate": 4.81107935727574e-06, + "loss": 0.9743, + "step": 204 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 2.558248996734619, + "learning_rate": 4.809237583321421e-06, + "loss": 0.2821, + "step": 205 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 2.692087173461914, + "learning_rate": 4.807387231229287e-06, + "loss": 0.7524, + "step": 206 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 2.661738157272339, + "learning_rate": 4.8055283078728525e-06, + "loss": 0.4304, + "step": 207 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 2.9232122898101807, + "learning_rate": 4.803660820157468e-06, + "loss": 0.6986, + "step": 208 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 2.665097951889038, + "learning_rate": 4.801784775020303e-06, + "loss": 0.7112, + "step": 209 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 2.4504497051239014, + "learning_rate": 4.799900179430312e-06, + "loss": 0.4125, + "step": 210 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 3.076204538345337, + "learning_rate": 4.798007040388212e-06, + "loss": 0.7057, + "step": 211 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 2.406977653503418, + "learning_rate": 4.7961053649264585e-06, + "loss": 0.708, + "step": 212 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 2.6545324325561523, + "learning_rate": 4.794195160109215e-06, + "loss": 0.7608, + "step": 213 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 4.3817033767700195, + "learning_rate": 4.7922764330323315e-06, + "loss": 0.4779, + "step": 214 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 3.534566879272461, + "learning_rate": 4.790349190823313e-06, + "loss": 0.5464, + "step": 215 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 3.0323140621185303, + "learning_rate": 4.788413440641297e-06, + "loss": 0.6198, + "step": 216 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 2.612746238708496, + "learning_rate": 4.786469189677026e-06, + "loss": 0.6695, + "step": 217 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 3.0299434661865234, + "learning_rate": 4.784516445152821e-06, + "loss": 0.4902, + "step": 218 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 3.4521942138671875, + "learning_rate": 4.78255521432255e-06, + "loss": 0.7411, + "step": 219 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 2.6712653636932373, + "learning_rate": 4.780585504471612e-06, + "loss": 0.8767, + "step": 220 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 2.5099475383758545, + "learning_rate": 4.778607322916896e-06, + "loss": 0.4266, + "step": 221 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 2.641799211502075, + "learning_rate": 4.776620677006766e-06, + "loss": 0.4982, + "step": 222 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 3.1119771003723145, + "learning_rate": 4.7746255741210256e-06, + "loss": 0.6012, + "step": 223 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 3.9957170486450195, + "learning_rate": 4.772622021670897e-06, + "loss": 0.7585, + "step": 224 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 3.1070823669433594, + "learning_rate": 4.770610027098983e-06, + "loss": 0.5266, + "step": 225 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 2.7630460262298584, + "learning_rate": 4.7685895978792564e-06, + "loss": 0.6261, + "step": 226 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 2.6509556770324707, + "learning_rate": 4.766560741517014e-06, + "loss": 0.7081, + "step": 227 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 3.0212976932525635, + "learning_rate": 4.76452346554886e-06, + "loss": 0.5041, + "step": 228 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 3.0454728603363037, + "learning_rate": 4.762477777542676e-06, + "loss": 0.49, + "step": 229 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 3.4296791553497314, + "learning_rate": 4.7604236850975905e-06, + "loss": 0.7056, + "step": 230 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 4.1885600090026855, + "learning_rate": 4.7583611958439514e-06, + "loss": 0.7762, + "step": 231 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 3.065854072570801, + "learning_rate": 4.7562903174433e-06, + "loss": 0.5347, + "step": 232 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 2.793851852416992, + "learning_rate": 4.75421105758834e-06, + "loss": 0.503, + "step": 233 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 3.123730421066284, + "learning_rate": 4.752123424002908e-06, + "loss": 0.5081, + "step": 234 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 3.230161666870117, + "learning_rate": 4.750027424441949e-06, + "loss": 0.7523, + "step": 235 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 2.4970247745513916, + "learning_rate": 4.747923066691487e-06, + "loss": 0.5575, + "step": 236 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 2.9880685806274414, + "learning_rate": 4.745810358568588e-06, + "loss": 0.7264, + "step": 237 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 2.555328369140625, + "learning_rate": 4.743689307921342e-06, + "loss": 0.4545, + "step": 238 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 3.144932746887207, + "learning_rate": 4.741559922628828e-06, + "loss": 0.5429, + "step": 239 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 3.059807062149048, + "learning_rate": 4.739422210601085e-06, + "loss": 0.5086, + "step": 240 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 3.374303102493286, + "learning_rate": 4.7372761797790836e-06, + "loss": 0.6109, + "step": 241 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 2.4506947994232178, + "learning_rate": 4.735121838134697e-06, + "loss": 0.4317, + "step": 242 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 2.9039974212646484, + "learning_rate": 4.732959193670672e-06, + "loss": 0.6414, + "step": 243 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 2.9412453174591064, + "learning_rate": 4.730788254420593e-06, + "loss": 0.5166, + "step": 244 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 2.500716209411621, + "learning_rate": 4.728609028448862e-06, + "loss": 0.4982, + "step": 245 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 2.4233803749084473, + "learning_rate": 4.726421523850662e-06, + "loss": 0.7552, + "step": 246 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 2.357003688812256, + "learning_rate": 4.7242257487519275e-06, + "loss": 0.4365, + "step": 247 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 2.6406495571136475, + "learning_rate": 4.722021711309317e-06, + "loss": 0.6002, + "step": 248 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 2.736884832382202, + "learning_rate": 4.7198094197101826e-06, + "loss": 0.4993, + "step": 249 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 3.5238845348358154, + "learning_rate": 4.7175888821725335e-06, + "loss": 0.4637, + "step": 250 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 3.3783695697784424, + "learning_rate": 4.715360106945015e-06, + "loss": 0.9711, + "step": 251 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 2.9685862064361572, + "learning_rate": 4.713123102306869e-06, + "loss": 0.5452, + "step": 252 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 3.143733263015747, + "learning_rate": 4.710877876567912e-06, + "loss": 0.5034, + "step": 253 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 2.8005623817443848, + "learning_rate": 4.708624438068494e-06, + "loss": 0.4236, + "step": 254 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 2.66581130027771, + "learning_rate": 4.706362795179476e-06, + "loss": 0.6095, + "step": 255 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 4.598043441772461, + "learning_rate": 4.7040929563021975e-06, + "loss": 0.738, + "step": 256 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 3.5643506050109863, + "learning_rate": 4.70181492986844e-06, + "loss": 0.6726, + "step": 257 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 2.865339994430542, + "learning_rate": 4.699528724340401e-06, + "loss": 0.4862, + "step": 258 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 2.95529842376709, + "learning_rate": 4.6972343482106615e-06, + "loss": 0.5003, + "step": 259 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 2.45206356048584, + "learning_rate": 4.6949318100021546e-06, + "loss": 0.6734, + "step": 260 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 2.6789939403533936, + "learning_rate": 4.6926211182681295e-06, + "loss": 0.5639, + "step": 261 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 3.307732582092285, + "learning_rate": 4.690302281592128e-06, + "loss": 0.7032, + "step": 262 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 2.8950445652008057, + "learning_rate": 4.687975308587944e-06, + "loss": 0.4937, + "step": 263 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 2.969377040863037, + "learning_rate": 4.685640207899598e-06, + "loss": 0.5829, + "step": 264 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 3.106433391571045, + "learning_rate": 4.683296988201301e-06, + "loss": 0.3805, + "step": 265 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 3.5599050521850586, + "learning_rate": 4.680945658197425e-06, + "loss": 0.7939, + "step": 266 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 5.008603096008301, + "learning_rate": 4.6785862266224695e-06, + "loss": 0.7511, + "step": 267 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 3.1393773555755615, + "learning_rate": 4.676218702241026e-06, + "loss": 0.8984, + "step": 268 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 3.0241408348083496, + "learning_rate": 4.673843093847753e-06, + "loss": 0.5473, + "step": 269 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 2.9029417037963867, + "learning_rate": 4.6714594102673355e-06, + "loss": 0.6626, + "step": 270 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 3.4709246158599854, + "learning_rate": 4.669067660354456e-06, + "loss": 0.5015, + "step": 271 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 2.988635778427124, + "learning_rate": 4.666667852993761e-06, + "loss": 0.5384, + "step": 272 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 3.418140411376953, + "learning_rate": 4.664259997099829e-06, + "loss": 0.7491, + "step": 273 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 2.592416763305664, + "learning_rate": 4.661844101617135e-06, + "loss": 0.6451, + "step": 274 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 3.1174306869506836, + "learning_rate": 4.6594201755200205e-06, + "loss": 0.6299, + "step": 275 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 2.6569998264312744, + "learning_rate": 4.656988227812658e-06, + "loss": 0.4477, + "step": 276 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 3.5733959674835205, + "learning_rate": 4.654548267529015e-06, + "loss": 0.5473, + "step": 277 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 2.7240824699401855, + "learning_rate": 4.652100303732827e-06, + "loss": 0.496, + "step": 278 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 4.1965460777282715, + "learning_rate": 4.64964434551756e-06, + "loss": 0.932, + "step": 279 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 2.3237173557281494, + "learning_rate": 4.647180402006372e-06, + "loss": 0.4648, + "step": 280 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 3.395045042037964, + "learning_rate": 4.644708482352093e-06, + "loss": 0.7237, + "step": 281 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 3.238593816757202, + "learning_rate": 4.6422285957371735e-06, + "loss": 0.5531, + "step": 282 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 3.9651403427124023, + "learning_rate": 4.639740751373663e-06, + "loss": 0.6706, + "step": 283 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 3.0042061805725098, + "learning_rate": 4.63724495850317e-06, + "loss": 0.56, + "step": 284 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 3.094310760498047, + "learning_rate": 4.634741226396832e-06, + "loss": 0.6138, + "step": 285 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 2.838168144226074, + "learning_rate": 4.632229564355275e-06, + "loss": 0.4908, + "step": 286 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 3.3452796936035156, + "learning_rate": 4.629709981708586e-06, + "loss": 0.8181, + "step": 287 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 2.6630783081054688, + "learning_rate": 4.6271824878162704e-06, + "loss": 0.5625, + "step": 288 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 2.583650588989258, + "learning_rate": 4.624647092067226e-06, + "loss": 0.3416, + "step": 289 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 2.73132586479187, + "learning_rate": 4.622103803879702e-06, + "loss": 0.3889, + "step": 290 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 4.1010260581970215, + "learning_rate": 4.619552632701263e-06, + "loss": 0.611, + "step": 291 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 4.53068208694458, + "learning_rate": 4.61699358800876e-06, + "loss": 0.7219, + "step": 292 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 3.4877254962921143, + "learning_rate": 4.614426679308291e-06, + "loss": 0.6402, + "step": 293 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 2.9445226192474365, + "learning_rate": 4.611851916135166e-06, + "loss": 0.509, + "step": 294 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 2.6622228622436523, + "learning_rate": 4.609269308053872e-06, + "loss": 0.6167, + "step": 295 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 3.131530523300171, + "learning_rate": 4.606678864658039e-06, + "loss": 0.8039, + "step": 296 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 3.212188482284546, + "learning_rate": 4.604080595570399e-06, + "loss": 0.5754, + "step": 297 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 3.522850275039673, + "learning_rate": 4.601474510442759e-06, + "loss": 0.4432, + "step": 298 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 2.5877151489257812, + "learning_rate": 4.598860618955957e-06, + "loss": 0.6541, + "step": 299 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 2.803833484649658, + "learning_rate": 4.596238930819832e-06, + "loss": 0.5824, + "step": 300 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 2.7125494480133057, + "learning_rate": 4.5936094557731815e-06, + "loss": 0.6976, + "step": 301 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 3.6549370288848877, + "learning_rate": 4.590972203583732e-06, + "loss": 0.7105, + "step": 302 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 3.3241944313049316, + "learning_rate": 4.588327184048099e-06, + "loss": 0.7446, + "step": 303 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 2.8388822078704834, + "learning_rate": 4.585674406991752e-06, + "loss": 0.4926, + "step": 304 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 2.9760420322418213, + "learning_rate": 4.5830138822689755e-06, + "loss": 0.7368, + "step": 305 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 2.5437633991241455, + "learning_rate": 4.5803456197628374e-06, + "loss": 0.4678, + "step": 306 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 3.0044775009155273, + "learning_rate": 4.577669629385145e-06, + "loss": 0.4241, + "step": 307 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 2.6150901317596436, + "learning_rate": 4.574985921076418e-06, + "loss": 0.5327, + "step": 308 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 2.4425182342529297, + "learning_rate": 4.572294504805841e-06, + "loss": 0.7504, + "step": 309 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 2.9920194149017334, + "learning_rate": 4.569595390571232e-06, + "loss": 0.5194, + "step": 310 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 2.701087713241577, + "learning_rate": 4.566888588399007e-06, + "loss": 0.6862, + "step": 311 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 7.628893852233887, + "learning_rate": 4.564174108344139e-06, + "loss": 0.6867, + "step": 312 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 2.712947130203247, + "learning_rate": 4.561451960490123e-06, + "loss": 0.6942, + "step": 313 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 3.0063202381134033, + "learning_rate": 4.558722154948937e-06, + "loss": 0.6346, + "step": 314 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 2.957218647003174, + "learning_rate": 4.5559847018610034e-06, + "loss": 0.464, + "step": 315 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 3.322282552719116, + "learning_rate": 4.553239611395156e-06, + "loss": 0.6334, + "step": 316 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 3.0638647079467773, + "learning_rate": 4.550486893748596e-06, + "loss": 0.4227, + "step": 317 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 3.079087257385254, + "learning_rate": 4.547726559146862e-06, + "loss": 0.3719, + "step": 318 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 2.409914255142212, + "learning_rate": 4.544958617843782e-06, + "loss": 0.3331, + "step": 319 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 3.3441262245178223, + "learning_rate": 4.542183080121444e-06, + "loss": 0.6931, + "step": 320 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 2.6624436378479004, + "learning_rate": 4.539399956290152e-06, + "loss": 0.6578, + "step": 321 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 3.463789224624634, + "learning_rate": 4.536609256688396e-06, + "loss": 0.5748, + "step": 322 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 3.6827807426452637, + "learning_rate": 4.533810991682799e-06, + "loss": 0.5249, + "step": 323 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 4.125547409057617, + "learning_rate": 4.531005171668093e-06, + "loss": 0.3065, + "step": 324 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 2.935978412628174, + "learning_rate": 4.528191807067074e-06, + "loss": 0.5523, + "step": 325 + }, + { + "epoch": 2.0, + "grad_norm": 2.654388427734375, + "learning_rate": 4.525370908330564e-06, + "loss": 0.4157, + "step": 326 + } + ], + "logging_steps": 1, + "max_steps": 1630, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.070798073082675e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56ba48611d00c8914e13c2391eae7ea287097af9 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json @@ -0,0 +1,3457 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 489, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 5.908512115478516, + "learning_rate": 5e-06, + "loss": 0.9606, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 4.304474353790283, + "learning_rate": 4.999995356617983e-06, + "loss": 0.8609, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 5.63697624206543, + "learning_rate": 4.999981426489179e-06, + "loss": 1.3543, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 3.6674246788024902, + "learning_rate": 4.999958209665336e-06, + "loss": 0.787, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 48.14854431152344, + "learning_rate": 4.999925706232695e-06, + "loss": 1.7786, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 7.8689866065979, + "learning_rate": 4.999883916312e-06, + "loss": 1.2175, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 5.119968891143799, + "learning_rate": 4.9998328400584864e-06, + "loss": 0.8998, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 3.730757713317871, + "learning_rate": 4.999772477661888e-06, + "loss": 0.8419, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 27.314565658569336, + "learning_rate": 4.999702829346432e-06, + "loss": 1.7948, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.822697162628174, + "learning_rate": 4.999623895370843e-06, + "loss": 1.0461, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 4.71220588684082, + "learning_rate": 4.999535676028338e-06, + "loss": 1.0, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.2378087043762207, + "learning_rate": 4.999438171646624e-06, + "loss": 0.9475, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 3.475543737411499, + "learning_rate": 4.999331382587901e-06, + "loss": 0.8654, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 10.06365966796875, + "learning_rate": 4.999215309248861e-06, + "loss": 1.2042, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 3.785153865814209, + "learning_rate": 4.999089952060681e-06, + "loss": 0.8846, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 2.944488048553467, + "learning_rate": 4.998955311489025e-06, + "loss": 0.8805, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 39.89304733276367, + "learning_rate": 4.998811388034046e-06, + "loss": 1.5882, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 3.5883963108062744, + "learning_rate": 4.9986581822303746e-06, + "loss": 0.9222, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 6.972247123718262, + "learning_rate": 4.998495694647127e-06, + "loss": 1.4088, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 3.948991298675537, + "learning_rate": 4.998323925887895e-06, + "loss": 1.454, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 3.8690035343170166, + "learning_rate": 4.998142876590749e-06, + "loss": 0.6335, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 5.243765830993652, + "learning_rate": 4.997952547428236e-06, + "loss": 0.6725, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 3.5994043350219727, + "learning_rate": 4.997752939107372e-06, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 4.06965970993042, + "learning_rate": 4.997544052369642e-06, + "loss": 0.9683, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 3.3247246742248535, + "learning_rate": 4.997325887990999e-06, + "loss": 0.9414, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 5.811742782592773, + "learning_rate": 4.997098446781861e-06, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 2.661334753036499, + "learning_rate": 4.996861729587103e-06, + "loss": 0.7708, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.863943576812744, + "learning_rate": 4.996615737286061e-06, + "loss": 0.6995, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 20.376733779907227, + "learning_rate": 4.996360470792524e-06, + "loss": 1.2563, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 3.62265682220459, + "learning_rate": 4.996095931054731e-06, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 3.915076732635498, + "learning_rate": 4.9958221190553705e-06, + "loss": 0.9227, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 3.129855155944824, + "learning_rate": 4.995539035811572e-06, + "loss": 0.701, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 2.7532224655151367, + "learning_rate": 4.9952466823749076e-06, + "loss": 0.6491, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 2.8444128036499023, + "learning_rate": 4.9949450598313835e-06, + "loss": 0.8029, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.57743239402771, + "learning_rate": 4.994634169301439e-06, + "loss": 0.8785, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 3.280055284500122, + "learning_rate": 4.994314011939941e-06, + "loss": 1.034, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 2.455838680267334, + "learning_rate": 4.99398458893618e-06, + "loss": 0.8557, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 4.72681188583374, + "learning_rate": 4.993645901513865e-06, + "loss": 1.1904, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 3.0585641860961914, + "learning_rate": 4.993297950931121e-06, + "loss": 0.7668, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.4603540897369385, + "learning_rate": 4.9929407384804806e-06, + "loss": 0.8812, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.9702436923980713, + "learning_rate": 4.992574265488883e-06, + "loss": 0.8878, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 2.6973602771759033, + "learning_rate": 4.9921985333176694e-06, + "loss": 0.7251, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 2.5542335510253906, + "learning_rate": 4.991813543362572e-06, + "loss": 0.6638, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.7530782222747803, + "learning_rate": 4.991419297053716e-06, + "loss": 1.0725, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 2.6483025550842285, + "learning_rate": 4.991015795855611e-06, + "loss": 0.7238, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 3.434422492980957, + "learning_rate": 4.990603041267144e-06, + "loss": 0.9188, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.914340019226074, + "learning_rate": 4.990181034821578e-06, + "loss": 0.6158, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 2.7211625576019287, + "learning_rate": 4.98974977808654e-06, + "loss": 0.7165, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.8414249420166016, + "learning_rate": 4.989309272664026e-06, + "loss": 0.7277, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 3.683204412460327, + "learning_rate": 4.988859520190381e-06, + "loss": 0.9793, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 3.1732583045959473, + "learning_rate": 4.988400522336304e-06, + "loss": 0.8966, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 2.7789194583892822, + "learning_rate": 4.9879322808068365e-06, + "loss": 0.8191, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.754816770553589, + "learning_rate": 4.987454797341358e-06, + "loss": 0.6308, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.730104684829712, + "learning_rate": 4.98696807371358e-06, + "loss": 0.8226, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 3.2225449085235596, + "learning_rate": 4.986472111731536e-06, + "loss": 0.9184, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 3.2684760093688965, + "learning_rate": 4.985966913237581e-06, + "loss": 0.6593, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.43105411529541, + "learning_rate": 4.985452480108376e-06, + "loss": 0.6994, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 7.366360664367676, + "learning_rate": 4.984928814254889e-06, + "loss": 1.1374, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.81864333152771, + "learning_rate": 4.984395917622387e-06, + "loss": 0.8097, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 3.1107730865478516, + "learning_rate": 4.9838537921904206e-06, + "loss": 0.8511, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 2.460545301437378, + "learning_rate": 4.9833024399728295e-06, + "loss": 0.898, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.921992778778076, + "learning_rate": 4.982741863017722e-06, + "loss": 0.6671, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 3.3006443977355957, + "learning_rate": 4.982172063407479e-06, + "loss": 1.0559, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.642587661743164, + "learning_rate": 4.9815930432587365e-06, + "loss": 0.6663, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.905898094177246, + "learning_rate": 4.981004804722384e-06, + "loss": 0.6895, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.9174182415008545, + "learning_rate": 4.980407349983556e-06, + "loss": 0.7982, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 2.214322805404663, + "learning_rate": 4.979800681261619e-06, + "loss": 0.6808, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 2.7152462005615234, + "learning_rate": 4.9791848008101705e-06, + "loss": 0.567, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.5657734870910645, + "learning_rate": 4.978559710917024e-06, + "loss": 0.7745, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 3.9103832244873047, + "learning_rate": 4.977925413904205e-06, + "loss": 0.9815, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 4.610236644744873, + "learning_rate": 4.9772819121279395e-06, + "loss": 1.164, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 3.01170015335083, + "learning_rate": 4.976629207978648e-06, + "loss": 0.7587, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 3.175889253616333, + "learning_rate": 4.975967303880933e-06, + "loss": 0.58, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 2.503741502761841, + "learning_rate": 4.975296202293575e-06, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 2.6778078079223633, + "learning_rate": 4.974615905709518e-06, + "loss": 0.7352, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 5.950812816619873, + "learning_rate": 4.973926416655863e-06, + "loss": 1.0643, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 3.0165305137634277, + "learning_rate": 4.973227737693858e-06, + "loss": 0.6699, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 4.793259620666504, + "learning_rate": 4.972519871418894e-06, + "loss": 1.0315, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 3.632815361022949, + "learning_rate": 4.971802820460481e-06, + "loss": 0.7003, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 3.077507734298706, + "learning_rate": 4.971076587482254e-06, + "loss": 0.6776, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 3.3886241912841797, + "learning_rate": 4.970341175181957e-06, + "loss": 0.7422, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 2.71288800239563, + "learning_rate": 4.969596586291425e-06, + "loss": 0.7471, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.777920961380005, + "learning_rate": 4.968842823576592e-06, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 6.496985912322998, + "learning_rate": 4.968079889837461e-06, + "loss": 0.9965, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.6163430213928223, + "learning_rate": 4.967307787908108e-06, + "loss": 0.6833, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 3.244098663330078, + "learning_rate": 4.966526520656663e-06, + "loss": 0.8373, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.9027860164642334, + "learning_rate": 4.965736090985305e-06, + "loss": 0.8529, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 2.3786230087280273, + "learning_rate": 4.964936501830246e-06, + "loss": 0.6577, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 7.3099045753479, + "learning_rate": 4.964127756161727e-06, + "loss": 1.1184, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 3.068873167037964, + "learning_rate": 4.963309856983998e-06, + "loss": 0.7906, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 3.082547426223755, + "learning_rate": 4.9624828073353144e-06, + "loss": 0.8107, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.4586973190307617, + "learning_rate": 4.961646610287922e-06, + "loss": 0.7421, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 2.779277801513672, + "learning_rate": 4.960801268948047e-06, + "loss": 0.7134, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 3.2255213260650635, + "learning_rate": 4.959946786455882e-06, + "loss": 0.5875, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 2.783395528793335, + "learning_rate": 4.959083165985581e-06, + "loss": 0.6595, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 2.240114212036133, + "learning_rate": 4.958210410745237e-06, + "loss": 0.793, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.9399421215057373, + "learning_rate": 4.957328523976879e-06, + "loss": 0.5896, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 3.4449355602264404, + "learning_rate": 4.956437508956458e-06, + "loss": 0.8658, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 4.273710250854492, + "learning_rate": 4.9555373689938325e-06, + "loss": 0.8316, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.4222047328948975, + "learning_rate": 4.954628107432757e-06, + "loss": 1.0613, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 2.5318963527679443, + "learning_rate": 4.95370972765087e-06, + "loss": 0.7194, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 2.7852585315704346, + "learning_rate": 4.952782233059683e-06, + "loss": 0.5927, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 2.6532323360443115, + "learning_rate": 4.951845627104565e-06, + "loss": 0.8505, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 2.3213467597961426, + "learning_rate": 4.95089991326473e-06, + "loss": 0.8682, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 2.607992649078369, + "learning_rate": 4.9499450950532305e-06, + "loss": 0.8735, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 3.9820072650909424, + "learning_rate": 4.94898117601693e-06, + "loss": 1.0571, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 3.3878824710845947, + "learning_rate": 4.948008159736507e-06, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 2.6935670375823975, + "learning_rate": 4.94702604982643e-06, + "loss": 0.5968, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.78190016746521, + "learning_rate": 4.9460348499349485e-06, + "loss": 0.7504, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 2.973083972930908, + "learning_rate": 4.945034563744077e-06, + "loss": 0.6728, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 2.631803512573242, + "learning_rate": 4.944025194969586e-06, + "loss": 0.609, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 2.7443883419036865, + "learning_rate": 4.9430067473609825e-06, + "loss": 0.8713, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 2.543769121170044, + "learning_rate": 4.941979224701499e-06, + "loss": 0.8035, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 3.7799901962280273, + "learning_rate": 4.94094263080808e-06, + "loss": 0.9341, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 3.1234734058380127, + "learning_rate": 4.939896969531367e-06, + "loss": 1.1066, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 2.356036424636841, + "learning_rate": 4.938842244755683e-06, + "loss": 0.853, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 3.6231274604797363, + "learning_rate": 4.937778460399022e-06, + "loss": 0.9116, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 3.1277005672454834, + "learning_rate": 4.936705620413028e-06, + "loss": 0.5888, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.7338361740112305, + "learning_rate": 4.935623728782986e-06, + "loss": 0.592, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 2.748363733291626, + "learning_rate": 4.934532789527805e-06, + "loss": 0.8713, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 4.460031986236572, + "learning_rate": 4.933432806700004e-06, + "loss": 0.6791, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 2.392911911010742, + "learning_rate": 4.932323784385693e-06, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.7804384231567383, + "learning_rate": 4.931205726704566e-06, + "loss": 0.7547, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.7664780616760254, + "learning_rate": 4.930078637809878e-06, + "loss": 0.7849, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 2.592808723449707, + "learning_rate": 4.928942521888431e-06, + "loss": 0.7015, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 2.7080585956573486, + "learning_rate": 4.927797383160561e-06, + "loss": 1.0028, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.7941503524780273, + "learning_rate": 4.926643225880123e-06, + "loss": 0.602, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 3.2796623706817627, + "learning_rate": 4.925480054334471e-06, + "loss": 0.7473, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 2.7623610496520996, + "learning_rate": 4.924307872844444e-06, + "loss": 1.0573, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.6224453449249268, + "learning_rate": 4.923126685764351e-06, + "loss": 0.7399, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 17.736326217651367, + "learning_rate": 4.921936497481956e-06, + "loss": 0.9548, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.504213333129883, + "learning_rate": 4.920737312418456e-06, + "loss": 0.6748, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 3.617077350616455, + "learning_rate": 4.919529135028473e-06, + "loss": 0.8431, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.6559832096099854, + "learning_rate": 4.918311969800027e-06, + "loss": 0.7243, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 2.7539305686950684, + "learning_rate": 4.917085821254532e-06, + "loss": 0.7845, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 3.3587615489959717, + "learning_rate": 4.915850693946766e-06, + "loss": 0.4891, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 3.064354181289673, + "learning_rate": 4.914606592464865e-06, + "loss": 0.7917, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 3.2505199909210205, + "learning_rate": 4.9133535214303e-06, + "loss": 0.9681, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 3.8027830123901367, + "learning_rate": 4.91209148549786e-06, + "loss": 0.9275, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 2.4154372215270996, + "learning_rate": 4.910820489355637e-06, + "loss": 0.7259, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.892462968826294, + "learning_rate": 4.909540537725007e-06, + "loss": 0.6061, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 3.3398196697235107, + "learning_rate": 4.908251635360616e-06, + "loss": 1.0559, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 3.022512197494507, + "learning_rate": 4.906953787050354e-06, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 2.658661365509033, + "learning_rate": 4.905646997615347e-06, + "loss": 0.6234, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 3.454400062561035, + "learning_rate": 4.904331271909932e-06, + "loss": 0.8066, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 3.1300277709960938, + "learning_rate": 4.903006614821645e-06, + "loss": 0.6861, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 2.362537145614624, + "learning_rate": 4.901673031271194e-06, + "loss": 0.6112, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 3.375577688217163, + "learning_rate": 4.900330526212451e-06, + "loss": 0.6314, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 2.955656051635742, + "learning_rate": 4.898979104632427e-06, + "loss": 0.889, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 2.9285926818847656, + "learning_rate": 4.897618771551255e-06, + "loss": 0.6406, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.131819725036621, + "learning_rate": 4.8962495320221714e-06, + "loss": 0.6368, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 2.780649185180664, + "learning_rate": 4.8948713911315e-06, + "loss": 0.8642, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 2.941500186920166, + "learning_rate": 4.8934843539986266e-06, + "loss": 0.714, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.892088425775986e-06, + "loss": 0.8365, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 2.6887171268463135, + "learning_rate": 4.890683611649041e-06, + "loss": 0.7937, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 3.7638463973999023, + "learning_rate": 4.8892699168362626e-06, + "loss": 0.7485, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.8132755756378174, + "learning_rate": 4.887847346589111e-06, + "loss": 0.6467, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.652247190475464, + "learning_rate": 4.886415906192015e-06, + "loss": 0.4651, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 2.5854647159576416, + "learning_rate": 4.884975600962355e-06, + "loss": 0.8756, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 3.1630544662475586, + "learning_rate": 4.883526436250441e-06, + "loss": 0.7339, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.84452748298645, + "learning_rate": 4.8820684174394935e-06, + "loss": 0.7808, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 3.604048490524292, + "learning_rate": 4.880601549945622e-06, + "loss": 0.96, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 2.302924871444702, + "learning_rate": 4.879125839217808e-06, + "loss": 0.8122, + "step": 163 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 3.1254405975341797, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.7307, + "step": 164 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 2.745603322982788, + "learning_rate": 4.8761479100205085e-06, + "loss": 0.7554, + "step": 165 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 2.494840145111084, + "learning_rate": 4.874645702613152e-06, + "loss": 0.4372, + "step": 166 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 2.3526735305786133, + "learning_rate": 4.873134674096072e-06, + "loss": 0.3597, + "step": 167 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 2.945887804031372, + "learning_rate": 4.871614830082297e-06, + "loss": 0.5854, + "step": 168 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 3.5723934173583984, + "learning_rate": 4.870086176217597e-06, + "loss": 0.7978, + "step": 169 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 3.2997145652770996, + "learning_rate": 4.868548718180473e-06, + "loss": 0.5593, + "step": 170 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 3.4120635986328125, + "learning_rate": 4.867002461682129e-06, + "loss": 0.4083, + "step": 171 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 2.697617292404175, + "learning_rate": 4.8654474124664505e-06, + "loss": 0.4752, + "step": 172 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 5.082247734069824, + "learning_rate": 4.863883576309991e-06, + "loss": 0.7435, + "step": 173 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 2.773864984512329, + "learning_rate": 4.8623109590219395e-06, + "loss": 0.4612, + "step": 174 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 3.429703712463379, + "learning_rate": 4.860729566444106e-06, + "loss": 0.4644, + "step": 175 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 2.997938394546509, + "learning_rate": 4.8591394044508985e-06, + "loss": 0.4852, + "step": 176 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 2.549513339996338, + "learning_rate": 4.857540478949302e-06, + "loss": 0.4574, + "step": 177 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 3.459400177001953, + "learning_rate": 4.855932795878852e-06, + "loss": 0.8095, + "step": 178 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 2.8103644847869873, + "learning_rate": 4.854316361211619e-06, + "loss": 0.4578, + "step": 179 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 2.631221055984497, + "learning_rate": 4.852691180952183e-06, + "loss": 0.5473, + "step": 180 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 3.189946174621582, + "learning_rate": 4.851057261137608e-06, + "loss": 0.4313, + "step": 181 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 2.891418933868408, + "learning_rate": 4.8494146078374274e-06, + "loss": 0.4197, + "step": 182 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 3.239637613296509, + "learning_rate": 4.847763227153612e-06, + "loss": 0.5865, + "step": 183 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 2.484644651412964, + "learning_rate": 4.846103125220557e-06, + "loss": 0.3866, + "step": 184 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 3.1045992374420166, + "learning_rate": 4.844434308205052e-06, + "loss": 0.5357, + "step": 185 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 2.648472309112549, + "learning_rate": 4.842756782306261e-06, + "loss": 0.4783, + "step": 186 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 2.5685644149780273, + "learning_rate": 4.841070553755697e-06, + "loss": 0.3733, + "step": 187 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 3.7727200984954834, + "learning_rate": 4.839375628817205e-06, + "loss": 0.6039, + "step": 188 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 2.8237369060516357, + "learning_rate": 4.837672013786931e-06, + "loss": 0.5372, + "step": 189 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 3.0312252044677734, + "learning_rate": 4.835959714993305e-06, + "loss": 0.5162, + "step": 190 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 2.821498394012451, + "learning_rate": 4.8342387387970105e-06, + "loss": 0.4537, + "step": 191 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 2.7834129333496094, + "learning_rate": 4.832509091590968e-06, + "loss": 0.6165, + "step": 192 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 2.9274091720581055, + "learning_rate": 4.830770779800309e-06, + "loss": 0.7475, + "step": 193 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 2.813945770263672, + "learning_rate": 4.829023809882349e-06, + "loss": 0.4629, + "step": 194 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 2.27876877784729, + "learning_rate": 4.827268188326567e-06, + "loss": 0.5208, + "step": 195 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 2.8444204330444336, + "learning_rate": 4.825503921654582e-06, + "loss": 0.6521, + "step": 196 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 3.3730578422546387, + "learning_rate": 4.823731016420122e-06, + "loss": 0.7491, + "step": 197 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 2.9717822074890137, + "learning_rate": 4.821949479209011e-06, + "loss": 0.3866, + "step": 198 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 2.6570653915405273, + "learning_rate": 4.820159316639133e-06, + "loss": 0.499, + "step": 199 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 2.819960117340088, + "learning_rate": 4.818360535360418e-06, + "loss": 0.556, + "step": 200 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 2.7912111282348633, + "learning_rate": 4.816553142054806e-06, + "loss": 0.3433, + "step": 201 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 2.6427981853485107, + "learning_rate": 4.814737143436232e-06, + "loss": 0.8808, + "step": 202 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 2.5917580127716064, + "learning_rate": 4.812912546250595e-06, + "loss": 0.5718, + "step": 203 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 3.770759344100952, + "learning_rate": 4.81107935727574e-06, + "loss": 0.9743, + "step": 204 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 2.558248996734619, + "learning_rate": 4.809237583321421e-06, + "loss": 0.2821, + "step": 205 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 2.692087173461914, + "learning_rate": 4.807387231229287e-06, + "loss": 0.7524, + "step": 206 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 2.661738157272339, + "learning_rate": 4.8055283078728525e-06, + "loss": 0.4304, + "step": 207 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 2.9232122898101807, + "learning_rate": 4.803660820157468e-06, + "loss": 0.6986, + "step": 208 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 2.665097951889038, + "learning_rate": 4.801784775020303e-06, + "loss": 0.7112, + "step": 209 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 2.4504497051239014, + "learning_rate": 4.799900179430312e-06, + "loss": 0.4125, + "step": 210 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 3.076204538345337, + "learning_rate": 4.798007040388212e-06, + "loss": 0.7057, + "step": 211 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 2.406977653503418, + "learning_rate": 4.7961053649264585e-06, + "loss": 0.708, + "step": 212 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 2.6545324325561523, + "learning_rate": 4.794195160109215e-06, + "loss": 0.7608, + "step": 213 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 4.3817033767700195, + "learning_rate": 4.7922764330323315e-06, + "loss": 0.4779, + "step": 214 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 3.534566879272461, + "learning_rate": 4.790349190823313e-06, + "loss": 0.5464, + "step": 215 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 3.0323140621185303, + "learning_rate": 4.788413440641297e-06, + "loss": 0.6198, + "step": 216 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 2.612746238708496, + "learning_rate": 4.786469189677026e-06, + "loss": 0.6695, + "step": 217 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 3.0299434661865234, + "learning_rate": 4.784516445152821e-06, + "loss": 0.4902, + "step": 218 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 3.4521942138671875, + "learning_rate": 4.78255521432255e-06, + "loss": 0.7411, + "step": 219 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 2.6712653636932373, + "learning_rate": 4.780585504471612e-06, + "loss": 0.8767, + "step": 220 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 2.5099475383758545, + "learning_rate": 4.778607322916896e-06, + "loss": 0.4266, + "step": 221 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 2.641799211502075, + "learning_rate": 4.776620677006766e-06, + "loss": 0.4982, + "step": 222 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 3.1119771003723145, + "learning_rate": 4.7746255741210256e-06, + "loss": 0.6012, + "step": 223 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 3.9957170486450195, + "learning_rate": 4.772622021670897e-06, + "loss": 0.7585, + "step": 224 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 3.1070823669433594, + "learning_rate": 4.770610027098983e-06, + "loss": 0.5266, + "step": 225 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 2.7630460262298584, + "learning_rate": 4.7685895978792564e-06, + "loss": 0.6261, + "step": 226 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 2.6509556770324707, + "learning_rate": 4.766560741517014e-06, + "loss": 0.7081, + "step": 227 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 3.0212976932525635, + "learning_rate": 4.76452346554886e-06, + "loss": 0.5041, + "step": 228 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 3.0454728603363037, + "learning_rate": 4.762477777542676e-06, + "loss": 0.49, + "step": 229 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 3.4296791553497314, + "learning_rate": 4.7604236850975905e-06, + "loss": 0.7056, + "step": 230 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 4.1885600090026855, + "learning_rate": 4.7583611958439514e-06, + "loss": 0.7762, + "step": 231 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 3.065854072570801, + "learning_rate": 4.7562903174433e-06, + "loss": 0.5347, + "step": 232 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 2.793851852416992, + "learning_rate": 4.75421105758834e-06, + "loss": 0.503, + "step": 233 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 3.123730421066284, + "learning_rate": 4.752123424002908e-06, + "loss": 0.5081, + "step": 234 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 3.230161666870117, + "learning_rate": 4.750027424441949e-06, + "loss": 0.7523, + "step": 235 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 2.4970247745513916, + "learning_rate": 4.747923066691487e-06, + "loss": 0.5575, + "step": 236 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 2.9880685806274414, + "learning_rate": 4.745810358568588e-06, + "loss": 0.7264, + "step": 237 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 2.555328369140625, + "learning_rate": 4.743689307921342e-06, + "loss": 0.4545, + "step": 238 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 3.144932746887207, + "learning_rate": 4.741559922628828e-06, + "loss": 0.5429, + "step": 239 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 3.059807062149048, + "learning_rate": 4.739422210601085e-06, + "loss": 0.5086, + "step": 240 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 3.374303102493286, + "learning_rate": 4.7372761797790836e-06, + "loss": 0.6109, + "step": 241 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 2.4506947994232178, + "learning_rate": 4.735121838134697e-06, + "loss": 0.4317, + "step": 242 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 2.9039974212646484, + "learning_rate": 4.732959193670672e-06, + "loss": 0.6414, + "step": 243 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 2.9412453174591064, + "learning_rate": 4.730788254420593e-06, + "loss": 0.5166, + "step": 244 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 2.500716209411621, + "learning_rate": 4.728609028448862e-06, + "loss": 0.4982, + "step": 245 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 2.4233803749084473, + "learning_rate": 4.726421523850662e-06, + "loss": 0.7552, + "step": 246 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 2.357003688812256, + "learning_rate": 4.7242257487519275e-06, + "loss": 0.4365, + "step": 247 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 2.6406495571136475, + "learning_rate": 4.722021711309317e-06, + "loss": 0.6002, + "step": 248 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 2.736884832382202, + "learning_rate": 4.7198094197101826e-06, + "loss": 0.4993, + "step": 249 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 3.5238845348358154, + "learning_rate": 4.7175888821725335e-06, + "loss": 0.4637, + "step": 250 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 3.3783695697784424, + "learning_rate": 4.715360106945015e-06, + "loss": 0.9711, + "step": 251 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 2.9685862064361572, + "learning_rate": 4.713123102306869e-06, + "loss": 0.5452, + "step": 252 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 3.143733263015747, + "learning_rate": 4.710877876567912e-06, + "loss": 0.5034, + "step": 253 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 2.8005623817443848, + "learning_rate": 4.708624438068494e-06, + "loss": 0.4236, + "step": 254 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 2.66581130027771, + "learning_rate": 4.706362795179476e-06, + "loss": 0.6095, + "step": 255 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 4.598043441772461, + "learning_rate": 4.7040929563021975e-06, + "loss": 0.738, + "step": 256 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 3.5643506050109863, + "learning_rate": 4.70181492986844e-06, + "loss": 0.6726, + "step": 257 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 2.865339994430542, + "learning_rate": 4.699528724340401e-06, + "loss": 0.4862, + "step": 258 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 2.95529842376709, + "learning_rate": 4.6972343482106615e-06, + "loss": 0.5003, + "step": 259 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 2.45206356048584, + "learning_rate": 4.6949318100021546e-06, + "loss": 0.6734, + "step": 260 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 2.6789939403533936, + "learning_rate": 4.6926211182681295e-06, + "loss": 0.5639, + "step": 261 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 3.307732582092285, + "learning_rate": 4.690302281592128e-06, + "loss": 0.7032, + "step": 262 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 2.8950445652008057, + "learning_rate": 4.687975308587944e-06, + "loss": 0.4937, + "step": 263 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 2.969377040863037, + "learning_rate": 4.685640207899598e-06, + "loss": 0.5829, + "step": 264 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 3.106433391571045, + "learning_rate": 4.683296988201301e-06, + "loss": 0.3805, + "step": 265 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 3.5599050521850586, + "learning_rate": 4.680945658197425e-06, + "loss": 0.7939, + "step": 266 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 5.008603096008301, + "learning_rate": 4.6785862266224695e-06, + "loss": 0.7511, + "step": 267 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 3.1393773555755615, + "learning_rate": 4.676218702241026e-06, + "loss": 0.8984, + "step": 268 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 3.0241408348083496, + "learning_rate": 4.673843093847753e-06, + "loss": 0.5473, + "step": 269 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 2.9029417037963867, + "learning_rate": 4.6714594102673355e-06, + "loss": 0.6626, + "step": 270 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 3.4709246158599854, + "learning_rate": 4.669067660354456e-06, + "loss": 0.5015, + "step": 271 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 2.988635778427124, + "learning_rate": 4.666667852993761e-06, + "loss": 0.5384, + "step": 272 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 3.418140411376953, + "learning_rate": 4.664259997099829e-06, + "loss": 0.7491, + "step": 273 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 2.592416763305664, + "learning_rate": 4.661844101617135e-06, + "loss": 0.6451, + "step": 274 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 3.1174306869506836, + "learning_rate": 4.6594201755200205e-06, + "loss": 0.6299, + "step": 275 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 2.6569998264312744, + "learning_rate": 4.656988227812658e-06, + "loss": 0.4477, + "step": 276 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 3.5733959674835205, + "learning_rate": 4.654548267529015e-06, + "loss": 0.5473, + "step": 277 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 2.7240824699401855, + "learning_rate": 4.652100303732827e-06, + "loss": 0.496, + "step": 278 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 4.1965460777282715, + "learning_rate": 4.64964434551756e-06, + "loss": 0.932, + "step": 279 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 2.3237173557281494, + "learning_rate": 4.647180402006372e-06, + "loss": 0.4648, + "step": 280 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 3.395045042037964, + "learning_rate": 4.644708482352093e-06, + "loss": 0.7237, + "step": 281 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 3.238593816757202, + "learning_rate": 4.6422285957371735e-06, + "loss": 0.5531, + "step": 282 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 3.9651403427124023, + "learning_rate": 4.639740751373663e-06, + "loss": 0.6706, + "step": 283 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 3.0042061805725098, + "learning_rate": 4.63724495850317e-06, + "loss": 0.56, + "step": 284 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 3.094310760498047, + "learning_rate": 4.634741226396832e-06, + "loss": 0.6138, + "step": 285 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 2.838168144226074, + "learning_rate": 4.632229564355275e-06, + "loss": 0.4908, + "step": 286 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 3.3452796936035156, + "learning_rate": 4.629709981708586e-06, + "loss": 0.8181, + "step": 287 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 2.6630783081054688, + "learning_rate": 4.6271824878162704e-06, + "loss": 0.5625, + "step": 288 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 2.583650588989258, + "learning_rate": 4.624647092067226e-06, + "loss": 0.3416, + "step": 289 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 2.73132586479187, + "learning_rate": 4.622103803879702e-06, + "loss": 0.3889, + "step": 290 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 4.1010260581970215, + "learning_rate": 4.619552632701263e-06, + "loss": 0.611, + "step": 291 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 4.53068208694458, + "learning_rate": 4.61699358800876e-06, + "loss": 0.7219, + "step": 292 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 3.4877254962921143, + "learning_rate": 4.614426679308291e-06, + "loss": 0.6402, + "step": 293 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 2.9445226192474365, + "learning_rate": 4.611851916135166e-06, + "loss": 0.509, + "step": 294 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 2.6622228622436523, + "learning_rate": 4.609269308053872e-06, + "loss": 0.6167, + "step": 295 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 3.131530523300171, + "learning_rate": 4.606678864658039e-06, + "loss": 0.8039, + "step": 296 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 3.212188482284546, + "learning_rate": 4.604080595570399e-06, + "loss": 0.5754, + "step": 297 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 3.522850275039673, + "learning_rate": 4.601474510442759e-06, + "loss": 0.4432, + "step": 298 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 2.5877151489257812, + "learning_rate": 4.598860618955957e-06, + "loss": 0.6541, + "step": 299 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 2.803833484649658, + "learning_rate": 4.596238930819832e-06, + "loss": 0.5824, + "step": 300 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 2.7125494480133057, + "learning_rate": 4.5936094557731815e-06, + "loss": 0.6976, + "step": 301 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 3.6549370288848877, + "learning_rate": 4.590972203583732e-06, + "loss": 0.7105, + "step": 302 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 3.3241944313049316, + "learning_rate": 4.588327184048099e-06, + "loss": 0.7446, + "step": 303 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 2.8388822078704834, + "learning_rate": 4.585674406991752e-06, + "loss": 0.4926, + "step": 304 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 2.9760420322418213, + "learning_rate": 4.5830138822689755e-06, + "loss": 0.7368, + "step": 305 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 2.5437633991241455, + "learning_rate": 4.5803456197628374e-06, + "loss": 0.4678, + "step": 306 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 3.0044775009155273, + "learning_rate": 4.577669629385145e-06, + "loss": 0.4241, + "step": 307 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 2.6150901317596436, + "learning_rate": 4.574985921076418e-06, + "loss": 0.5327, + "step": 308 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 2.4425182342529297, + "learning_rate": 4.572294504805841e-06, + "loss": 0.7504, + "step": 309 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 2.9920194149017334, + "learning_rate": 4.569595390571232e-06, + "loss": 0.5194, + "step": 310 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 2.701087713241577, + "learning_rate": 4.566888588399007e-06, + "loss": 0.6862, + "step": 311 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 7.628893852233887, + "learning_rate": 4.564174108344139e-06, + "loss": 0.6867, + "step": 312 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 2.712947130203247, + "learning_rate": 4.561451960490123e-06, + "loss": 0.6942, + "step": 313 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 3.0063202381134033, + "learning_rate": 4.558722154948937e-06, + "loss": 0.6346, + "step": 314 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 2.957218647003174, + "learning_rate": 4.5559847018610034e-06, + "loss": 0.464, + "step": 315 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 3.322282552719116, + "learning_rate": 4.553239611395156e-06, + "loss": 0.6334, + "step": 316 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 3.0638647079467773, + "learning_rate": 4.550486893748596e-06, + "loss": 0.4227, + "step": 317 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 3.079087257385254, + "learning_rate": 4.547726559146862e-06, + "loss": 0.3719, + "step": 318 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 2.409914255142212, + "learning_rate": 4.544958617843782e-06, + "loss": 0.3331, + "step": 319 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 3.3441262245178223, + "learning_rate": 4.542183080121444e-06, + "loss": 0.6931, + "step": 320 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 2.6624436378479004, + "learning_rate": 4.539399956290152e-06, + "loss": 0.6578, + "step": 321 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 3.463789224624634, + "learning_rate": 4.536609256688396e-06, + "loss": 0.5748, + "step": 322 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 3.6827807426452637, + "learning_rate": 4.533810991682799e-06, + "loss": 0.5249, + "step": 323 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 4.125547409057617, + "learning_rate": 4.531005171668093e-06, + "loss": 0.3065, + "step": 324 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 2.935978412628174, + "learning_rate": 4.528191807067074e-06, + "loss": 0.5523, + "step": 325 + }, + { + "epoch": 2.0, + "grad_norm": 2.654388427734375, + "learning_rate": 4.525370908330564e-06, + "loss": 0.4157, + "step": 326 + }, + { + "epoch": 2.0061349693251533, + "grad_norm": 3.213925838470459, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4243, + "step": 327 + }, + { + "epoch": 2.0122699386503067, + "grad_norm": 3.5483286380767822, + "learning_rate": 4.519706550394248e-06, + "loss": 0.4137, + "step": 328 + }, + { + "epoch": 2.01840490797546, + "grad_norm": 3.32084059715271, + "learning_rate": 4.516863112235864e-06, + "loss": 0.5389, + "step": 329 + }, + { + "epoch": 2.0245398773006134, + "grad_norm": 3.427666425704956, + "learning_rate": 4.514012182024756e-06, + "loss": 0.285, + "step": 330 + }, + { + "epoch": 2.0306748466257667, + "grad_norm": 3.3269975185394287, + "learning_rate": 4.511153770351288e-06, + "loss": 0.4877, + "step": 331 + }, + { + "epoch": 2.03680981595092, + "grad_norm": 5.258850574493408, + "learning_rate": 4.508287887833619e-06, + "loss": 0.5168, + "step": 332 + }, + { + "epoch": 2.042944785276074, + "grad_norm": 4.316092491149902, + "learning_rate": 4.505414545117658e-06, + "loss": 0.4791, + "step": 333 + }, + { + "epoch": 2.049079754601227, + "grad_norm": 3.952056884765625, + "learning_rate": 4.502533752877028e-06, + "loss": 0.3014, + "step": 334 + }, + { + "epoch": 2.0552147239263805, + "grad_norm": 4.0617194175720215, + "learning_rate": 4.499645521813024e-06, + "loss": 0.4313, + "step": 335 + }, + { + "epoch": 2.061349693251534, + "grad_norm": 3.7869274616241455, + "learning_rate": 4.496749862654574e-06, + "loss": 0.4807, + "step": 336 + }, + { + "epoch": 2.067484662576687, + "grad_norm": 3.8181991577148438, + "learning_rate": 4.4938467861582e-06, + "loss": 0.4002, + "step": 337 + }, + { + "epoch": 2.0736196319018405, + "grad_norm": 3.8289854526519775, + "learning_rate": 4.490936303107975e-06, + "loss": 0.618, + "step": 338 + }, + { + "epoch": 2.079754601226994, + "grad_norm": 3.121443271636963, + "learning_rate": 4.488018424315488e-06, + "loss": 0.4203, + "step": 339 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 3.141782283782959, + "learning_rate": 4.4850931606198e-06, + "loss": 0.3618, + "step": 340 + }, + { + "epoch": 2.0920245398773005, + "grad_norm": 3.1279287338256836, + "learning_rate": 4.482160522887404e-06, + "loss": 0.4571, + "step": 341 + }, + { + "epoch": 2.098159509202454, + "grad_norm": 3.2418482303619385, + "learning_rate": 4.479220522012185e-06, + "loss": 0.2674, + "step": 342 + }, + { + "epoch": 2.104294478527607, + "grad_norm": 10.230683326721191, + "learning_rate": 4.476273168915382e-06, + "loss": 0.5479, + "step": 343 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 3.588361978530884, + "learning_rate": 4.473318474545544e-06, + "loss": 0.3654, + "step": 344 + }, + { + "epoch": 2.116564417177914, + "grad_norm": 3.0913164615631104, + "learning_rate": 4.470356449878489e-06, + "loss": 0.2704, + "step": 345 + }, + { + "epoch": 2.1226993865030677, + "grad_norm": 3.972447633743286, + "learning_rate": 4.467387105917269e-06, + "loss": 0.3029, + "step": 346 + }, + { + "epoch": 2.128834355828221, + "grad_norm": 3.7174713611602783, + "learning_rate": 4.464410453692122e-06, + "loss": 0.6536, + "step": 347 + }, + { + "epoch": 2.1349693251533743, + "grad_norm": 3.9333994388580322, + "learning_rate": 4.461426504260434e-06, + "loss": 0.3806, + "step": 348 + }, + { + "epoch": 2.1411042944785277, + "grad_norm": 4.752816200256348, + "learning_rate": 4.458435268706699e-06, + "loss": 0.4019, + "step": 349 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 2.505603790283203, + "learning_rate": 4.455436758142477e-06, + "loss": 0.2348, + "step": 350 + }, + { + "epoch": 2.1533742331288344, + "grad_norm": 3.3050570487976074, + "learning_rate": 4.452430983706351e-06, + "loss": 0.505, + "step": 351 + }, + { + "epoch": 2.1595092024539877, + "grad_norm": 5.387442588806152, + "learning_rate": 4.44941795656389e-06, + "loss": 0.399, + "step": 352 + }, + { + "epoch": 2.165644171779141, + "grad_norm": 3.4759480953216553, + "learning_rate": 4.446397687907601e-06, + "loss": 0.5664, + "step": 353 + }, + { + "epoch": 2.1717791411042944, + "grad_norm": 2.949445962905884, + "learning_rate": 4.4433701889568935e-06, + "loss": 0.2128, + "step": 354 + }, + { + "epoch": 2.1779141104294477, + "grad_norm": 3.2884252071380615, + "learning_rate": 4.440335470958035e-06, + "loss": 0.3138, + "step": 355 + }, + { + "epoch": 2.184049079754601, + "grad_norm": 3.1605632305145264, + "learning_rate": 4.437293545184111e-06, + "loss": 0.349, + "step": 356 + }, + { + "epoch": 2.190184049079755, + "grad_norm": 2.9996821880340576, + "learning_rate": 4.434244422934976e-06, + "loss": 0.343, + "step": 357 + }, + { + "epoch": 2.196319018404908, + "grad_norm": 3.6373324394226074, + "learning_rate": 4.431188115537226e-06, + "loss": 0.5656, + "step": 358 + }, + { + "epoch": 2.2024539877300615, + "grad_norm": 4.667621612548828, + "learning_rate": 4.428124634344141e-06, + "loss": 0.2335, + "step": 359 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 3.815484046936035, + "learning_rate": 4.425053990735653e-06, + "loss": 0.2173, + "step": 360 + }, + { + "epoch": 2.214723926380368, + "grad_norm": 4.689478874206543, + "learning_rate": 4.421976196118297e-06, + "loss": 0.5071, + "step": 361 + }, + { + "epoch": 2.2208588957055215, + "grad_norm": 4.016942024230957, + "learning_rate": 4.4188912619251765e-06, + "loss": 0.384, + "step": 362 + }, + { + "epoch": 2.226993865030675, + "grad_norm": 3.5336828231811523, + "learning_rate": 4.415799199615912e-06, + "loss": 0.3133, + "step": 363 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 2.9195592403411865, + "learning_rate": 4.4127000206766055e-06, + "loss": 0.3847, + "step": 364 + }, + { + "epoch": 2.2392638036809815, + "grad_norm": 2.6843531131744385, + "learning_rate": 4.409593736619795e-06, + "loss": 0.3539, + "step": 365 + }, + { + "epoch": 2.245398773006135, + "grad_norm": 2.8692703247070312, + "learning_rate": 4.40648035898441e-06, + "loss": 0.3664, + "step": 366 + }, + { + "epoch": 2.2515337423312882, + "grad_norm": 2.820422649383545, + "learning_rate": 4.403359899335732e-06, + "loss": 0.4606, + "step": 367 + }, + { + "epoch": 2.2576687116564416, + "grad_norm": 3.8641669750213623, + "learning_rate": 4.400232369265351e-06, + "loss": 0.2931, + "step": 368 + }, + { + "epoch": 2.263803680981595, + "grad_norm": 2.75347638130188, + "learning_rate": 4.39709778039112e-06, + "loss": 0.3393, + "step": 369 + }, + { + "epoch": 2.2699386503067487, + "grad_norm": 15.150428771972656, + "learning_rate": 4.393956144357113e-06, + "loss": 0.65, + "step": 370 + }, + { + "epoch": 2.276073619631902, + "grad_norm": 2.4876065254211426, + "learning_rate": 4.390807472833585e-06, + "loss": 0.372, + "step": 371 + }, + { + "epoch": 2.2822085889570554, + "grad_norm": 2.7328054904937744, + "learning_rate": 4.3876517775169216e-06, + "loss": 0.2802, + "step": 372 + }, + { + "epoch": 2.2883435582822087, + "grad_norm": 2.903221368789673, + "learning_rate": 4.384489070129604e-06, + "loss": 0.1964, + "step": 373 + }, + { + "epoch": 2.294478527607362, + "grad_norm": 3.9368724822998047, + "learning_rate": 4.381319362420158e-06, + "loss": 0.4272, + "step": 374 + }, + { + "epoch": 2.3006134969325154, + "grad_norm": 5.431981086730957, + "learning_rate": 4.378142666163114e-06, + "loss": 0.4513, + "step": 375 + }, + { + "epoch": 2.3067484662576687, + "grad_norm": 3.661733627319336, + "learning_rate": 4.374958993158965e-06, + "loss": 0.6087, + "step": 376 + }, + { + "epoch": 2.312883435582822, + "grad_norm": 3.004450559616089, + "learning_rate": 4.371768355234116e-06, + "loss": 0.2206, + "step": 377 + }, + { + "epoch": 2.3190184049079754, + "grad_norm": 4.3785576820373535, + "learning_rate": 4.368570764240852e-06, + "loss": 0.6055, + "step": 378 + }, + { + "epoch": 2.3251533742331287, + "grad_norm": 3.4699394702911377, + "learning_rate": 4.365366232057279e-06, + "loss": 0.6286, + "step": 379 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 2.6862998008728027, + "learning_rate": 4.3621547705872915e-06, + "loss": 0.2622, + "step": 380 + }, + { + "epoch": 2.3374233128834354, + "grad_norm": 3.056382179260254, + "learning_rate": 4.358936391760524e-06, + "loss": 0.3439, + "step": 381 + }, + { + "epoch": 2.3435582822085887, + "grad_norm": 2.6211307048797607, + "learning_rate": 4.355711107532305e-06, + "loss": 0.3677, + "step": 382 + }, + { + "epoch": 2.3496932515337425, + "grad_norm": 2.682060956954956, + "learning_rate": 4.3524789298836175e-06, + "loss": 0.3068, + "step": 383 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 3.482539415359497, + "learning_rate": 4.349239870821049e-06, + "loss": 0.3737, + "step": 384 + }, + { + "epoch": 2.361963190184049, + "grad_norm": 2.8645472526550293, + "learning_rate": 4.345993942376752e-06, + "loss": 0.2837, + "step": 385 + }, + { + "epoch": 2.3680981595092025, + "grad_norm": 3.6142354011535645, + "learning_rate": 4.342741156608392e-06, + "loss": 0.6162, + "step": 386 + }, + { + "epoch": 2.374233128834356, + "grad_norm": 3.0748162269592285, + "learning_rate": 4.3394815255991135e-06, + "loss": 0.2986, + "step": 387 + }, + { + "epoch": 2.3803680981595092, + "grad_norm": 5.090906620025635, + "learning_rate": 4.336215061457485e-06, + "loss": 0.5383, + "step": 388 + }, + { + "epoch": 2.3865030674846626, + "grad_norm": 3.9235823154449463, + "learning_rate": 4.332941776317458e-06, + "loss": 0.4179, + "step": 389 + }, + { + "epoch": 2.392638036809816, + "grad_norm": 3.482926368713379, + "learning_rate": 4.329661682338325e-06, + "loss": 0.3938, + "step": 390 + }, + { + "epoch": 2.3987730061349692, + "grad_norm": 4.274583339691162, + "learning_rate": 4.32637479170467e-06, + "loss": 0.3349, + "step": 391 + }, + { + "epoch": 2.4049079754601226, + "grad_norm": 3.326012372970581, + "learning_rate": 4.323081116626322e-06, + "loss": 0.3336, + "step": 392 + }, + { + "epoch": 2.411042944785276, + "grad_norm": 3.174591541290283, + "learning_rate": 4.319780669338316e-06, + "loss": 0.2983, + "step": 393 + }, + { + "epoch": 2.4171779141104293, + "grad_norm": 3.9073634147644043, + "learning_rate": 4.31647346210084e-06, + "loss": 0.8401, + "step": 394 + }, + { + "epoch": 2.4233128834355826, + "grad_norm": 3.4787721633911133, + "learning_rate": 4.313159507199197e-06, + "loss": 0.2583, + "step": 395 + }, + { + "epoch": 2.4294478527607364, + "grad_norm": 3.19903564453125, + "learning_rate": 4.309838816943755e-06, + "loss": 0.2861, + "step": 396 + }, + { + "epoch": 2.4355828220858897, + "grad_norm": 3.184246778488159, + "learning_rate": 4.306511403669897e-06, + "loss": 0.2956, + "step": 397 + }, + { + "epoch": 2.441717791411043, + "grad_norm": 3.8991878032684326, + "learning_rate": 4.303177279737988e-06, + "loss": 0.5378, + "step": 398 + }, + { + "epoch": 2.4478527607361964, + "grad_norm": 3.411949872970581, + "learning_rate": 4.299836457533313e-06, + "loss": 0.3423, + "step": 399 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 3.445502996444702, + "learning_rate": 4.296488949466046e-06, + "loss": 0.5608, + "step": 400 + }, + { + "epoch": 2.460122699386503, + "grad_norm": 3.066798210144043, + "learning_rate": 4.293134767971193e-06, + "loss": 0.3214, + "step": 401 + }, + { + "epoch": 2.4662576687116564, + "grad_norm": 3.0581583976745605, + "learning_rate": 4.28977392550855e-06, + "loss": 0.5117, + "step": 402 + }, + { + "epoch": 2.4723926380368098, + "grad_norm": 4.207413673400879, + "learning_rate": 4.286406434562659e-06, + "loss": 0.2666, + "step": 403 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 2.9934990406036377, + "learning_rate": 4.283032307642756e-06, + "loss": 0.2878, + "step": 404 + }, + { + "epoch": 2.4846625766871164, + "grad_norm": 3.800593614578247, + "learning_rate": 4.2796515572827305e-06, + "loss": 0.2619, + "step": 405 + }, + { + "epoch": 2.4907975460122698, + "grad_norm": 3.2029523849487305, + "learning_rate": 4.276264196041074e-06, + "loss": 0.1735, + "step": 406 + }, + { + "epoch": 2.4969325153374236, + "grad_norm": 3.515634059906006, + "learning_rate": 4.2728702365008356e-06, + "loss": 0.4741, + "step": 407 + }, + { + "epoch": 2.5030674846625764, + "grad_norm": 3.8354873657226562, + "learning_rate": 4.269469691269577e-06, + "loss": 0.3713, + "step": 408 + }, + { + "epoch": 2.5092024539877302, + "grad_norm": 3.902904510498047, + "learning_rate": 4.266062572979323e-06, + "loss": 0.5189, + "step": 409 + }, + { + "epoch": 2.5153374233128836, + "grad_norm": 3.3276097774505615, + "learning_rate": 4.262648894286515e-06, + "loss": 0.2461, + "step": 410 + }, + { + "epoch": 2.521472392638037, + "grad_norm": 2.9457011222839355, + "learning_rate": 4.259228667871963e-06, + "loss": 0.3013, + "step": 411 + }, + { + "epoch": 2.5276073619631902, + "grad_norm": 2.8941617012023926, + "learning_rate": 4.255801906440803e-06, + "loss": 0.2784, + "step": 412 + }, + { + "epoch": 2.5337423312883436, + "grad_norm": 2.949399471282959, + "learning_rate": 4.252368622722443e-06, + "loss": 0.457, + "step": 413 + }, + { + "epoch": 2.539877300613497, + "grad_norm": 3.342108726501465, + "learning_rate": 4.248928829470522e-06, + "loss": 0.487, + "step": 414 + }, + { + "epoch": 2.5460122699386503, + "grad_norm": 3.9556386470794678, + "learning_rate": 4.245482539462861e-06, + "loss": 0.6118, + "step": 415 + }, + { + "epoch": 2.5521472392638036, + "grad_norm": 3.6936280727386475, + "learning_rate": 4.242029765501411e-06, + "loss": 0.6131, + "step": 416 + }, + { + "epoch": 2.558282208588957, + "grad_norm": 2.79897403717041, + "learning_rate": 4.2385705204122104e-06, + "loss": 0.4209, + "step": 417 + }, + { + "epoch": 2.5644171779141103, + "grad_norm": 4.093318462371826, + "learning_rate": 4.235104817045338e-06, + "loss": 0.5375, + "step": 418 + }, + { + "epoch": 2.5705521472392636, + "grad_norm": 3.138263463973999, + "learning_rate": 4.231632668274861e-06, + "loss": 0.4682, + "step": 419 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 3.1465651988983154, + "learning_rate": 4.22815408699879e-06, + "loss": 0.2522, + "step": 420 + }, + { + "epoch": 2.5828220858895703, + "grad_norm": 3.5166101455688477, + "learning_rate": 4.22466908613903e-06, + "loss": 0.4776, + "step": 421 + }, + { + "epoch": 2.588957055214724, + "grad_norm": 2.8498189449310303, + "learning_rate": 4.221177678641333e-06, + "loss": 0.3067, + "step": 422 + }, + { + "epoch": 2.5950920245398774, + "grad_norm": 2.8046035766601562, + "learning_rate": 4.217679877475251e-06, + "loss": 0.2402, + "step": 423 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 4.204788684844971, + "learning_rate": 4.214175695634084e-06, + "loss": 0.2608, + "step": 424 + }, + { + "epoch": 2.607361963190184, + "grad_norm": 2.5569400787353516, + "learning_rate": 4.210665146134838e-06, + "loss": 0.2801, + "step": 425 + }, + { + "epoch": 2.6134969325153374, + "grad_norm": 3.5359091758728027, + "learning_rate": 4.20714824201817e-06, + "loss": 0.2027, + "step": 426 + }, + { + "epoch": 2.6196319018404908, + "grad_norm": 3.5132668018341064, + "learning_rate": 4.203624996348343e-06, + "loss": 0.4253, + "step": 427 + }, + { + "epoch": 2.625766871165644, + "grad_norm": 3.5076472759246826, + "learning_rate": 4.200095422213177e-06, + "loss": 0.3014, + "step": 428 + }, + { + "epoch": 2.6319018404907975, + "grad_norm": 3.6501238346099854, + "learning_rate": 4.196559532724004e-06, + "loss": 0.6526, + "step": 429 + }, + { + "epoch": 2.638036809815951, + "grad_norm": 2.849924325942993, + "learning_rate": 4.193017341015608e-06, + "loss": 0.4487, + "step": 430 + }, + { + "epoch": 2.644171779141104, + "grad_norm": 3.2228448390960693, + "learning_rate": 4.189468860246192e-06, + "loss": 0.5386, + "step": 431 + }, + { + "epoch": 2.6503067484662575, + "grad_norm": 2.532102108001709, + "learning_rate": 4.185914103597316e-06, + "loss": 0.3034, + "step": 432 + }, + { + "epoch": 2.6564417177914113, + "grad_norm": 2.862720251083374, + "learning_rate": 4.182353084273855e-06, + "loss": 0.5862, + "step": 433 + }, + { + "epoch": 2.662576687116564, + "grad_norm": 3.4617464542388916, + "learning_rate": 4.178785815503946e-06, + "loss": 0.3954, + "step": 434 + }, + { + "epoch": 2.668711656441718, + "grad_norm": 2.627758741378784, + "learning_rate": 4.1752123105389444e-06, + "loss": 0.4367, + "step": 435 + }, + { + "epoch": 2.6748466257668713, + "grad_norm": 3.2868380546569824, + "learning_rate": 4.171632582653368e-06, + "loss": 0.2997, + "step": 436 + }, + { + "epoch": 2.6809815950920246, + "grad_norm": 3.4260897636413574, + "learning_rate": 4.168046645144851e-06, + "loss": 0.3354, + "step": 437 + }, + { + "epoch": 2.687116564417178, + "grad_norm": 3.1415748596191406, + "learning_rate": 4.164454511334098e-06, + "loss": 0.5538, + "step": 438 + }, + { + "epoch": 2.6932515337423313, + "grad_norm": 3.3700919151306152, + "learning_rate": 4.160856194564828e-06, + "loss": 0.5731, + "step": 439 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 3.146968364715576, + "learning_rate": 4.157251708203728e-06, + "loss": 0.4429, + "step": 440 + }, + { + "epoch": 2.705521472392638, + "grad_norm": 3.7495830059051514, + "learning_rate": 4.153641065640402e-06, + "loss": 0.6361, + "step": 441 + }, + { + "epoch": 2.7116564417177913, + "grad_norm": 3.426499128341675, + "learning_rate": 4.150024280287327e-06, + "loss": 0.2418, + "step": 442 + }, + { + "epoch": 2.7177914110429446, + "grad_norm": 3.213719606399536, + "learning_rate": 4.146401365579795e-06, + "loss": 0.2549, + "step": 443 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 3.457742929458618, + "learning_rate": 4.142772334975868e-06, + "loss": 0.3822, + "step": 444 + }, + { + "epoch": 2.7300613496932513, + "grad_norm": 3.130410671234131, + "learning_rate": 4.139137201956324e-06, + "loss": 0.3107, + "step": 445 + }, + { + "epoch": 2.736196319018405, + "grad_norm": 2.7337112426757812, + "learning_rate": 4.1354959800246155e-06, + "loss": 0.2829, + "step": 446 + }, + { + "epoch": 2.7423312883435584, + "grad_norm": 3.427006483078003, + "learning_rate": 4.131848682706807e-06, + "loss": 0.3045, + "step": 447 + }, + { + "epoch": 2.7484662576687118, + "grad_norm": 3.3742318153381348, + "learning_rate": 4.128195323551536e-06, + "loss": 0.316, + "step": 448 + }, + { + "epoch": 2.754601226993865, + "grad_norm": 3.086738109588623, + "learning_rate": 4.1245359161299555e-06, + "loss": 0.5278, + "step": 449 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 3.4609954357147217, + "learning_rate": 4.120870474035687e-06, + "loss": 0.447, + "step": 450 + }, + { + "epoch": 2.766871165644172, + "grad_norm": 3.552663803100586, + "learning_rate": 4.1171990108847705e-06, + "loss": 0.6127, + "step": 451 + }, + { + "epoch": 2.773006134969325, + "grad_norm": 4.413427352905273, + "learning_rate": 4.113521540315609e-06, + "loss": 0.3304, + "step": 452 + }, + { + "epoch": 2.7791411042944785, + "grad_norm": 3.3408143520355225, + "learning_rate": 4.109838075988922e-06, + "loss": 0.5871, + "step": 453 + }, + { + "epoch": 2.785276073619632, + "grad_norm": 3.0659773349761963, + "learning_rate": 4.106148631587697e-06, + "loss": 0.3578, + "step": 454 + }, + { + "epoch": 2.791411042944785, + "grad_norm": 3.2854816913604736, + "learning_rate": 4.102453220817134e-06, + "loss": 0.4685, + "step": 455 + }, + { + "epoch": 2.7975460122699385, + "grad_norm": 3.4940855503082275, + "learning_rate": 4.098751857404595e-06, + "loss": 0.2818, + "step": 456 + }, + { + "epoch": 2.8036809815950923, + "grad_norm": 2.4630730152130127, + "learning_rate": 4.0950445550995566e-06, + "loss": 0.3497, + "step": 457 + }, + { + "epoch": 2.809815950920245, + "grad_norm": 3.3870959281921387, + "learning_rate": 4.091331327673554e-06, + "loss": 0.4954, + "step": 458 + }, + { + "epoch": 2.815950920245399, + "grad_norm": 2.3676836490631104, + "learning_rate": 4.087612188920135e-06, + "loss": 0.3884, + "step": 459 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 3.2477807998657227, + "learning_rate": 4.083887152654804e-06, + "loss": 0.375, + "step": 460 + }, + { + "epoch": 2.8282208588957056, + "grad_norm": 3.295673131942749, + "learning_rate": 4.080156232714976e-06, + "loss": 0.3272, + "step": 461 + }, + { + "epoch": 2.834355828220859, + "grad_norm": 2.800847291946411, + "learning_rate": 4.07641944295992e-06, + "loss": 0.2936, + "step": 462 + }, + { + "epoch": 2.8404907975460123, + "grad_norm": 3.443336009979248, + "learning_rate": 4.072676797270708e-06, + "loss": 0.2363, + "step": 463 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 3.1334242820739746, + "learning_rate": 4.0689283095501684e-06, + "loss": 0.4827, + "step": 464 + }, + { + "epoch": 2.852760736196319, + "grad_norm": 3.950672149658203, + "learning_rate": 4.06517399372283e-06, + "loss": 0.3163, + "step": 465 + }, + { + "epoch": 2.8588957055214723, + "grad_norm": 4.243579387664795, + "learning_rate": 4.061413863734869e-06, + "loss": 0.2827, + "step": 466 + }, + { + "epoch": 2.8650306748466257, + "grad_norm": 4.076017379760742, + "learning_rate": 4.057647933554063e-06, + "loss": 0.3466, + "step": 467 + }, + { + "epoch": 2.871165644171779, + "grad_norm": 2.846989631652832, + "learning_rate": 4.053876217169734e-06, + "loss": 0.4632, + "step": 468 + }, + { + "epoch": 2.8773006134969323, + "grad_norm": 2.74981689453125, + "learning_rate": 4.050098728592698e-06, + "loss": 0.2001, + "step": 469 + }, + { + "epoch": 2.883435582822086, + "grad_norm": 3.062068462371826, + "learning_rate": 4.046315481855211e-06, + "loss": 0.5425, + "step": 470 + }, + { + "epoch": 2.889570552147239, + "grad_norm": 2.8630964756011963, + "learning_rate": 4.0425264910109245e-06, + "loss": 0.424, + "step": 471 + }, + { + "epoch": 2.895705521472393, + "grad_norm": 3.537442922592163, + "learning_rate": 4.03873177013482e-06, + "loss": 0.2443, + "step": 472 + }, + { + "epoch": 2.901840490797546, + "grad_norm": 3.128535270690918, + "learning_rate": 4.034931333323173e-06, + "loss": 0.3734, + "step": 473 + }, + { + "epoch": 2.9079754601226995, + "grad_norm": 3.021897792816162, + "learning_rate": 4.031125194693484e-06, + "loss": 0.3762, + "step": 474 + }, + { + "epoch": 2.914110429447853, + "grad_norm": 3.0943546295166016, + "learning_rate": 4.0273133683844375e-06, + "loss": 0.3721, + "step": 475 + }, + { + "epoch": 2.920245398773006, + "grad_norm": 3.443448305130005, + "learning_rate": 4.023495868555848e-06, + "loss": 0.2868, + "step": 476 + }, + { + "epoch": 2.9263803680981595, + "grad_norm": 2.865227222442627, + "learning_rate": 4.0196727093886024e-06, + "loss": 0.5086, + "step": 477 + }, + { + "epoch": 2.932515337423313, + "grad_norm": 3.1272058486938477, + "learning_rate": 4.015843905084612e-06, + "loss": 0.4616, + "step": 478 + }, + { + "epoch": 2.938650306748466, + "grad_norm": 3.0584447383880615, + "learning_rate": 4.012009469866756e-06, + "loss": 0.403, + "step": 479 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 4.42616081237793, + "learning_rate": 4.008169417978836e-06, + "loss": 0.5801, + "step": 480 + }, + { + "epoch": 2.950920245398773, + "grad_norm": 2.8444535732269287, + "learning_rate": 4.004323763685511e-06, + "loss": 0.5808, + "step": 481 + }, + { + "epoch": 2.957055214723926, + "grad_norm": 2.591719627380371, + "learning_rate": 4.0004725212722565e-06, + "loss": 0.2584, + "step": 482 + }, + { + "epoch": 2.96319018404908, + "grad_norm": 2.5496113300323486, + "learning_rate": 3.996615705045302e-06, + "loss": 0.462, + "step": 483 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 2.9932925701141357, + "learning_rate": 3.992753329331588e-06, + "loss": 0.3502, + "step": 484 + }, + { + "epoch": 2.9754601226993866, + "grad_norm": 3.136871337890625, + "learning_rate": 3.9888854084786995e-06, + "loss": 0.5989, + "step": 485 + }, + { + "epoch": 2.98159509202454, + "grad_norm": 3.6654274463653564, + "learning_rate": 3.985011956854826e-06, + "loss": 0.6772, + "step": 486 + }, + { + "epoch": 2.9877300613496933, + "grad_norm": 2.5398948192596436, + "learning_rate": 3.9811329888487004e-06, + "loss": 0.4192, + "step": 487 + }, + { + "epoch": 2.9938650306748467, + "grad_norm": 4.89943790435791, + "learning_rate": 3.977248518869545e-06, + "loss": 0.4031, + "step": 488 + }, + { + "epoch": 3.0, + "grad_norm": 3.4729995727539062, + "learning_rate": 3.973358561347024e-06, + "loss": 0.7764, + "step": 489 + } + ], + "logging_steps": 1, + "max_steps": 1630, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2108401575408435e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..17dff7da7fd9ce0c2508f372ba60a87e064d249b --- /dev/null +++ b/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json @@ -0,0 +1,6880 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 978, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 5.908512115478516, + "learning_rate": 5e-06, + "loss": 0.9606, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 4.304474353790283, + "learning_rate": 4.999995356617983e-06, + "loss": 0.8609, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 5.63697624206543, + "learning_rate": 4.999981426489179e-06, + "loss": 1.3543, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 3.6674246788024902, + "learning_rate": 4.999958209665336e-06, + "loss": 0.787, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 48.14854431152344, + "learning_rate": 4.999925706232695e-06, + "loss": 1.7786, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 7.8689866065979, + "learning_rate": 4.999883916312e-06, + "loss": 1.2175, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 5.119968891143799, + "learning_rate": 4.9998328400584864e-06, + "loss": 0.8998, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 3.730757713317871, + "learning_rate": 4.999772477661888e-06, + "loss": 0.8419, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 27.314565658569336, + "learning_rate": 4.999702829346432e-06, + "loss": 1.7948, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.822697162628174, + "learning_rate": 4.999623895370843e-06, + "loss": 1.0461, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 4.71220588684082, + "learning_rate": 4.999535676028338e-06, + "loss": 1.0, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.2378087043762207, + "learning_rate": 4.999438171646624e-06, + "loss": 0.9475, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 3.475543737411499, + "learning_rate": 4.999331382587901e-06, + "loss": 0.8654, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 10.06365966796875, + "learning_rate": 4.999215309248861e-06, + "loss": 1.2042, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 3.785153865814209, + "learning_rate": 4.999089952060681e-06, + "loss": 0.8846, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 2.944488048553467, + "learning_rate": 4.998955311489025e-06, + "loss": 0.8805, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 39.89304733276367, + "learning_rate": 4.998811388034046e-06, + "loss": 1.5882, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 3.5883963108062744, + "learning_rate": 4.9986581822303746e-06, + "loss": 0.9222, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 6.972247123718262, + "learning_rate": 4.998495694647127e-06, + "loss": 1.4088, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 3.948991298675537, + "learning_rate": 4.998323925887895e-06, + "loss": 1.454, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 3.8690035343170166, + "learning_rate": 4.998142876590749e-06, + "loss": 0.6335, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 5.243765830993652, + "learning_rate": 4.997952547428236e-06, + "loss": 0.6725, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 3.5994043350219727, + "learning_rate": 4.997752939107372e-06, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 4.06965970993042, + "learning_rate": 4.997544052369642e-06, + "loss": 0.9683, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 3.3247246742248535, + "learning_rate": 4.997325887990999e-06, + "loss": 0.9414, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 5.811742782592773, + "learning_rate": 4.997098446781861e-06, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 2.661334753036499, + "learning_rate": 4.996861729587103e-06, + "loss": 0.7708, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.863943576812744, + "learning_rate": 4.996615737286061e-06, + "loss": 0.6995, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 20.376733779907227, + "learning_rate": 4.996360470792524e-06, + "loss": 1.2563, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 3.62265682220459, + "learning_rate": 4.996095931054731e-06, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 3.915076732635498, + "learning_rate": 4.9958221190553705e-06, + "loss": 0.9227, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 3.129855155944824, + "learning_rate": 4.995539035811572e-06, + "loss": 0.701, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 2.7532224655151367, + "learning_rate": 4.9952466823749076e-06, + "loss": 0.6491, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 2.8444128036499023, + "learning_rate": 4.9949450598313835e-06, + "loss": 0.8029, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.57743239402771, + "learning_rate": 4.994634169301439e-06, + "loss": 0.8785, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 3.280055284500122, + "learning_rate": 4.994314011939941e-06, + "loss": 1.034, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 2.455838680267334, + "learning_rate": 4.99398458893618e-06, + "loss": 0.8557, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 4.72681188583374, + "learning_rate": 4.993645901513865e-06, + "loss": 1.1904, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 3.0585641860961914, + "learning_rate": 4.993297950931121e-06, + "loss": 0.7668, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.4603540897369385, + "learning_rate": 4.9929407384804806e-06, + "loss": 0.8812, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.9702436923980713, + "learning_rate": 4.992574265488883e-06, + "loss": 0.8878, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 2.6973602771759033, + "learning_rate": 4.9921985333176694e-06, + "loss": 0.7251, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 2.5542335510253906, + "learning_rate": 4.991813543362572e-06, + "loss": 0.6638, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.7530782222747803, + "learning_rate": 4.991419297053716e-06, + "loss": 1.0725, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 2.6483025550842285, + "learning_rate": 4.991015795855611e-06, + "loss": 0.7238, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 3.434422492980957, + "learning_rate": 4.990603041267144e-06, + "loss": 0.9188, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.914340019226074, + "learning_rate": 4.990181034821578e-06, + "loss": 0.6158, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 2.7211625576019287, + "learning_rate": 4.98974977808654e-06, + "loss": 0.7165, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.8414249420166016, + "learning_rate": 4.989309272664026e-06, + "loss": 0.7277, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 3.683204412460327, + "learning_rate": 4.988859520190381e-06, + "loss": 0.9793, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 3.1732583045959473, + "learning_rate": 4.988400522336304e-06, + "loss": 0.8966, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 2.7789194583892822, + "learning_rate": 4.9879322808068365e-06, + "loss": 0.8191, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.754816770553589, + "learning_rate": 4.987454797341358e-06, + "loss": 0.6308, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.730104684829712, + "learning_rate": 4.98696807371358e-06, + "loss": 0.8226, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 3.2225449085235596, + "learning_rate": 4.986472111731536e-06, + "loss": 0.9184, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 3.2684760093688965, + "learning_rate": 4.985966913237581e-06, + "loss": 0.6593, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.43105411529541, + "learning_rate": 4.985452480108376e-06, + "loss": 0.6994, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 7.366360664367676, + "learning_rate": 4.984928814254889e-06, + "loss": 1.1374, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.81864333152771, + "learning_rate": 4.984395917622387e-06, + "loss": 0.8097, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 3.1107730865478516, + "learning_rate": 4.9838537921904206e-06, + "loss": 0.8511, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 2.460545301437378, + "learning_rate": 4.9833024399728295e-06, + "loss": 0.898, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.921992778778076, + "learning_rate": 4.982741863017722e-06, + "loss": 0.6671, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 3.3006443977355957, + "learning_rate": 4.982172063407479e-06, + "loss": 1.0559, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.642587661743164, + "learning_rate": 4.9815930432587365e-06, + "loss": 0.6663, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.905898094177246, + "learning_rate": 4.981004804722384e-06, + "loss": 0.6895, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.9174182415008545, + "learning_rate": 4.980407349983556e-06, + "loss": 0.7982, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 2.214322805404663, + "learning_rate": 4.979800681261619e-06, + "loss": 0.6808, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 2.7152462005615234, + "learning_rate": 4.9791848008101705e-06, + "loss": 0.567, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.5657734870910645, + "learning_rate": 4.978559710917024e-06, + "loss": 0.7745, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 3.9103832244873047, + "learning_rate": 4.977925413904205e-06, + "loss": 0.9815, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 4.610236644744873, + "learning_rate": 4.9772819121279395e-06, + "loss": 1.164, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 3.01170015335083, + "learning_rate": 4.976629207978648e-06, + "loss": 0.7587, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 3.175889253616333, + "learning_rate": 4.975967303880933e-06, + "loss": 0.58, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 2.503741502761841, + "learning_rate": 4.975296202293575e-06, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 2.6778078079223633, + "learning_rate": 4.974615905709518e-06, + "loss": 0.7352, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 5.950812816619873, + "learning_rate": 4.973926416655863e-06, + "loss": 1.0643, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 3.0165305137634277, + "learning_rate": 4.973227737693858e-06, + "loss": 0.6699, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 4.793259620666504, + "learning_rate": 4.972519871418894e-06, + "loss": 1.0315, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 3.632815361022949, + "learning_rate": 4.971802820460481e-06, + "loss": 0.7003, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 3.077507734298706, + "learning_rate": 4.971076587482254e-06, + "loss": 0.6776, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 3.3886241912841797, + "learning_rate": 4.970341175181957e-06, + "loss": 0.7422, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 2.71288800239563, + "learning_rate": 4.969596586291425e-06, + "loss": 0.7471, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.777920961380005, + "learning_rate": 4.968842823576592e-06, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 6.496985912322998, + "learning_rate": 4.968079889837461e-06, + "loss": 0.9965, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.6163430213928223, + "learning_rate": 4.967307787908108e-06, + "loss": 0.6833, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 3.244098663330078, + "learning_rate": 4.966526520656663e-06, + "loss": 0.8373, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.9027860164642334, + "learning_rate": 4.965736090985305e-06, + "loss": 0.8529, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 2.3786230087280273, + "learning_rate": 4.964936501830246e-06, + "loss": 0.6577, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 7.3099045753479, + "learning_rate": 4.964127756161727e-06, + "loss": 1.1184, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 3.068873167037964, + "learning_rate": 4.963309856983998e-06, + "loss": 0.7906, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 3.082547426223755, + "learning_rate": 4.9624828073353144e-06, + "loss": 0.8107, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.4586973190307617, + "learning_rate": 4.961646610287922e-06, + "loss": 0.7421, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 2.779277801513672, + "learning_rate": 4.960801268948047e-06, + "loss": 0.7134, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 3.2255213260650635, + "learning_rate": 4.959946786455882e-06, + "loss": 0.5875, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 2.783395528793335, + "learning_rate": 4.959083165985581e-06, + "loss": 0.6595, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 2.240114212036133, + "learning_rate": 4.958210410745237e-06, + "loss": 0.793, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.9399421215057373, + "learning_rate": 4.957328523976879e-06, + "loss": 0.5896, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 3.4449355602264404, + "learning_rate": 4.956437508956458e-06, + "loss": 0.8658, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 4.273710250854492, + "learning_rate": 4.9555373689938325e-06, + "loss": 0.8316, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.4222047328948975, + "learning_rate": 4.954628107432757e-06, + "loss": 1.0613, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 2.5318963527679443, + "learning_rate": 4.95370972765087e-06, + "loss": 0.7194, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 2.7852585315704346, + "learning_rate": 4.952782233059683e-06, + "loss": 0.5927, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 2.6532323360443115, + "learning_rate": 4.951845627104565e-06, + "loss": 0.8505, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 2.3213467597961426, + "learning_rate": 4.95089991326473e-06, + "loss": 0.8682, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 2.607992649078369, + "learning_rate": 4.9499450950532305e-06, + "loss": 0.8735, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 3.9820072650909424, + "learning_rate": 4.94898117601693e-06, + "loss": 1.0571, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 3.3878824710845947, + "learning_rate": 4.948008159736507e-06, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 2.6935670375823975, + "learning_rate": 4.94702604982643e-06, + "loss": 0.5968, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.78190016746521, + "learning_rate": 4.9460348499349485e-06, + "loss": 0.7504, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 2.973083972930908, + "learning_rate": 4.945034563744077e-06, + "loss": 0.6728, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 2.631803512573242, + "learning_rate": 4.944025194969586e-06, + "loss": 0.609, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 2.7443883419036865, + "learning_rate": 4.9430067473609825e-06, + "loss": 0.8713, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 2.543769121170044, + "learning_rate": 4.941979224701499e-06, + "loss": 0.8035, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 3.7799901962280273, + "learning_rate": 4.94094263080808e-06, + "loss": 0.9341, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 3.1234734058380127, + "learning_rate": 4.939896969531367e-06, + "loss": 1.1066, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 2.356036424636841, + "learning_rate": 4.938842244755683e-06, + "loss": 0.853, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 3.6231274604797363, + "learning_rate": 4.937778460399022e-06, + "loss": 0.9116, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 3.1277005672454834, + "learning_rate": 4.936705620413028e-06, + "loss": 0.5888, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.7338361740112305, + "learning_rate": 4.935623728782986e-06, + "loss": 0.592, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 2.748363733291626, + "learning_rate": 4.934532789527805e-06, + "loss": 0.8713, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 4.460031986236572, + "learning_rate": 4.933432806700004e-06, + "loss": 0.6791, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 2.392911911010742, + "learning_rate": 4.932323784385693e-06, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.7804384231567383, + "learning_rate": 4.931205726704566e-06, + "loss": 0.7547, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.7664780616760254, + "learning_rate": 4.930078637809878e-06, + "loss": 0.7849, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 2.592808723449707, + "learning_rate": 4.928942521888431e-06, + "loss": 0.7015, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 2.7080585956573486, + "learning_rate": 4.927797383160561e-06, + "loss": 1.0028, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.7941503524780273, + "learning_rate": 4.926643225880123e-06, + "loss": 0.602, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 3.2796623706817627, + "learning_rate": 4.925480054334471e-06, + "loss": 0.7473, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 2.7623610496520996, + "learning_rate": 4.924307872844444e-06, + "loss": 1.0573, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.6224453449249268, + "learning_rate": 4.923126685764351e-06, + "loss": 0.7399, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 17.736326217651367, + "learning_rate": 4.921936497481956e-06, + "loss": 0.9548, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.504213333129883, + "learning_rate": 4.920737312418456e-06, + "loss": 0.6748, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 3.617077350616455, + "learning_rate": 4.919529135028473e-06, + "loss": 0.8431, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.6559832096099854, + "learning_rate": 4.918311969800027e-06, + "loss": 0.7243, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 2.7539305686950684, + "learning_rate": 4.917085821254532e-06, + "loss": 0.7845, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 3.3587615489959717, + "learning_rate": 4.915850693946766e-06, + "loss": 0.4891, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 3.064354181289673, + "learning_rate": 4.914606592464865e-06, + "loss": 0.7917, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 3.2505199909210205, + "learning_rate": 4.9133535214303e-06, + "loss": 0.9681, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 3.8027830123901367, + "learning_rate": 4.91209148549786e-06, + "loss": 0.9275, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 2.4154372215270996, + "learning_rate": 4.910820489355637e-06, + "loss": 0.7259, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.892462968826294, + "learning_rate": 4.909540537725007e-06, + "loss": 0.6061, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 3.3398196697235107, + "learning_rate": 4.908251635360616e-06, + "loss": 1.0559, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 3.022512197494507, + "learning_rate": 4.906953787050354e-06, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 2.658661365509033, + "learning_rate": 4.905646997615347e-06, + "loss": 0.6234, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 3.454400062561035, + "learning_rate": 4.904331271909932e-06, + "loss": 0.8066, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 3.1300277709960938, + "learning_rate": 4.903006614821645e-06, + "loss": 0.6861, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 2.362537145614624, + "learning_rate": 4.901673031271194e-06, + "loss": 0.6112, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 3.375577688217163, + "learning_rate": 4.900330526212451e-06, + "loss": 0.6314, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 2.955656051635742, + "learning_rate": 4.898979104632427e-06, + "loss": 0.889, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 2.9285926818847656, + "learning_rate": 4.897618771551255e-06, + "loss": 0.6406, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.131819725036621, + "learning_rate": 4.8962495320221714e-06, + "loss": 0.6368, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 2.780649185180664, + "learning_rate": 4.8948713911315e-06, + "loss": 0.8642, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 2.941500186920166, + "learning_rate": 4.8934843539986266e-06, + "loss": 0.714, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.892088425775986e-06, + "loss": 0.8365, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 2.6887171268463135, + "learning_rate": 4.890683611649041e-06, + "loss": 0.7937, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 3.7638463973999023, + "learning_rate": 4.8892699168362626e-06, + "loss": 0.7485, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.8132755756378174, + "learning_rate": 4.887847346589111e-06, + "loss": 0.6467, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.652247190475464, + "learning_rate": 4.886415906192015e-06, + "loss": 0.4651, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 2.5854647159576416, + "learning_rate": 4.884975600962355e-06, + "loss": 0.8756, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 3.1630544662475586, + "learning_rate": 4.883526436250441e-06, + "loss": 0.7339, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.84452748298645, + "learning_rate": 4.8820684174394935e-06, + "loss": 0.7808, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 3.604048490524292, + "learning_rate": 4.880601549945622e-06, + "loss": 0.96, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 2.302924871444702, + "learning_rate": 4.879125839217808e-06, + "loss": 0.8122, + "step": 163 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 3.1254405975341797, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.7307, + "step": 164 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 2.745603322982788, + "learning_rate": 4.8761479100205085e-06, + "loss": 0.7554, + "step": 165 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 2.494840145111084, + "learning_rate": 4.874645702613152e-06, + "loss": 0.4372, + "step": 166 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 2.3526735305786133, + "learning_rate": 4.873134674096072e-06, + "loss": 0.3597, + "step": 167 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 2.945887804031372, + "learning_rate": 4.871614830082297e-06, + "loss": 0.5854, + "step": 168 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 3.5723934173583984, + "learning_rate": 4.870086176217597e-06, + "loss": 0.7978, + "step": 169 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 3.2997145652770996, + "learning_rate": 4.868548718180473e-06, + "loss": 0.5593, + "step": 170 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 3.4120635986328125, + "learning_rate": 4.867002461682129e-06, + "loss": 0.4083, + "step": 171 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 2.697617292404175, + "learning_rate": 4.8654474124664505e-06, + "loss": 0.4752, + "step": 172 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 5.082247734069824, + "learning_rate": 4.863883576309991e-06, + "loss": 0.7435, + "step": 173 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 2.773864984512329, + "learning_rate": 4.8623109590219395e-06, + "loss": 0.4612, + "step": 174 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 3.429703712463379, + "learning_rate": 4.860729566444106e-06, + "loss": 0.4644, + "step": 175 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 2.997938394546509, + "learning_rate": 4.8591394044508985e-06, + "loss": 0.4852, + "step": 176 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 2.549513339996338, + "learning_rate": 4.857540478949302e-06, + "loss": 0.4574, + "step": 177 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 3.459400177001953, + "learning_rate": 4.855932795878852e-06, + "loss": 0.8095, + "step": 178 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 2.8103644847869873, + "learning_rate": 4.854316361211619e-06, + "loss": 0.4578, + "step": 179 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 2.631221055984497, + "learning_rate": 4.852691180952183e-06, + "loss": 0.5473, + "step": 180 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 3.189946174621582, + "learning_rate": 4.851057261137608e-06, + "loss": 0.4313, + "step": 181 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 2.891418933868408, + "learning_rate": 4.8494146078374274e-06, + "loss": 0.4197, + "step": 182 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 3.239637613296509, + "learning_rate": 4.847763227153612e-06, + "loss": 0.5865, + "step": 183 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 2.484644651412964, + "learning_rate": 4.846103125220557e-06, + "loss": 0.3866, + "step": 184 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 3.1045992374420166, + "learning_rate": 4.844434308205052e-06, + "loss": 0.5357, + "step": 185 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 2.648472309112549, + "learning_rate": 4.842756782306261e-06, + "loss": 0.4783, + "step": 186 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 2.5685644149780273, + "learning_rate": 4.841070553755697e-06, + "loss": 0.3733, + "step": 187 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 3.7727200984954834, + "learning_rate": 4.839375628817205e-06, + "loss": 0.6039, + "step": 188 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 2.8237369060516357, + "learning_rate": 4.837672013786931e-06, + "loss": 0.5372, + "step": 189 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 3.0312252044677734, + "learning_rate": 4.835959714993305e-06, + "loss": 0.5162, + "step": 190 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 2.821498394012451, + "learning_rate": 4.8342387387970105e-06, + "loss": 0.4537, + "step": 191 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 2.7834129333496094, + "learning_rate": 4.832509091590968e-06, + "loss": 0.6165, + "step": 192 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 2.9274091720581055, + "learning_rate": 4.830770779800309e-06, + "loss": 0.7475, + "step": 193 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 2.813945770263672, + "learning_rate": 4.829023809882349e-06, + "loss": 0.4629, + "step": 194 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 2.27876877784729, + "learning_rate": 4.827268188326567e-06, + "loss": 0.5208, + "step": 195 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 2.8444204330444336, + "learning_rate": 4.825503921654582e-06, + "loss": 0.6521, + "step": 196 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 3.3730578422546387, + "learning_rate": 4.823731016420122e-06, + "loss": 0.7491, + "step": 197 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 2.9717822074890137, + "learning_rate": 4.821949479209011e-06, + "loss": 0.3866, + "step": 198 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 2.6570653915405273, + "learning_rate": 4.820159316639133e-06, + "loss": 0.499, + "step": 199 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 2.819960117340088, + "learning_rate": 4.818360535360418e-06, + "loss": 0.556, + "step": 200 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 2.7912111282348633, + "learning_rate": 4.816553142054806e-06, + "loss": 0.3433, + "step": 201 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 2.6427981853485107, + "learning_rate": 4.814737143436232e-06, + "loss": 0.8808, + "step": 202 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 2.5917580127716064, + "learning_rate": 4.812912546250595e-06, + "loss": 0.5718, + "step": 203 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 3.770759344100952, + "learning_rate": 4.81107935727574e-06, + "loss": 0.9743, + "step": 204 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 2.558248996734619, + "learning_rate": 4.809237583321421e-06, + "loss": 0.2821, + "step": 205 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 2.692087173461914, + "learning_rate": 4.807387231229287e-06, + "loss": 0.7524, + "step": 206 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 2.661738157272339, + "learning_rate": 4.8055283078728525e-06, + "loss": 0.4304, + "step": 207 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 2.9232122898101807, + "learning_rate": 4.803660820157468e-06, + "loss": 0.6986, + "step": 208 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 2.665097951889038, + "learning_rate": 4.801784775020303e-06, + "loss": 0.7112, + "step": 209 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 2.4504497051239014, + "learning_rate": 4.799900179430312e-06, + "loss": 0.4125, + "step": 210 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 3.076204538345337, + "learning_rate": 4.798007040388212e-06, + "loss": 0.7057, + "step": 211 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 2.406977653503418, + "learning_rate": 4.7961053649264585e-06, + "loss": 0.708, + "step": 212 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 2.6545324325561523, + "learning_rate": 4.794195160109215e-06, + "loss": 0.7608, + "step": 213 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 4.3817033767700195, + "learning_rate": 4.7922764330323315e-06, + "loss": 0.4779, + "step": 214 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 3.534566879272461, + "learning_rate": 4.790349190823313e-06, + "loss": 0.5464, + "step": 215 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 3.0323140621185303, + "learning_rate": 4.788413440641297e-06, + "loss": 0.6198, + "step": 216 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 2.612746238708496, + "learning_rate": 4.786469189677026e-06, + "loss": 0.6695, + "step": 217 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 3.0299434661865234, + "learning_rate": 4.784516445152821e-06, + "loss": 0.4902, + "step": 218 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 3.4521942138671875, + "learning_rate": 4.78255521432255e-06, + "loss": 0.7411, + "step": 219 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 2.6712653636932373, + "learning_rate": 4.780585504471612e-06, + "loss": 0.8767, + "step": 220 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 2.5099475383758545, + "learning_rate": 4.778607322916896e-06, + "loss": 0.4266, + "step": 221 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 2.641799211502075, + "learning_rate": 4.776620677006766e-06, + "loss": 0.4982, + "step": 222 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 3.1119771003723145, + "learning_rate": 4.7746255741210256e-06, + "loss": 0.6012, + "step": 223 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 3.9957170486450195, + "learning_rate": 4.772622021670897e-06, + "loss": 0.7585, + "step": 224 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 3.1070823669433594, + "learning_rate": 4.770610027098983e-06, + "loss": 0.5266, + "step": 225 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 2.7630460262298584, + "learning_rate": 4.7685895978792564e-06, + "loss": 0.6261, + "step": 226 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 2.6509556770324707, + "learning_rate": 4.766560741517014e-06, + "loss": 0.7081, + "step": 227 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 3.0212976932525635, + "learning_rate": 4.76452346554886e-06, + "loss": 0.5041, + "step": 228 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 3.0454728603363037, + "learning_rate": 4.762477777542676e-06, + "loss": 0.49, + "step": 229 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 3.4296791553497314, + "learning_rate": 4.7604236850975905e-06, + "loss": 0.7056, + "step": 230 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 4.1885600090026855, + "learning_rate": 4.7583611958439514e-06, + "loss": 0.7762, + "step": 231 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 3.065854072570801, + "learning_rate": 4.7562903174433e-06, + "loss": 0.5347, + "step": 232 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 2.793851852416992, + "learning_rate": 4.75421105758834e-06, + "loss": 0.503, + "step": 233 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 3.123730421066284, + "learning_rate": 4.752123424002908e-06, + "loss": 0.5081, + "step": 234 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 3.230161666870117, + "learning_rate": 4.750027424441949e-06, + "loss": 0.7523, + "step": 235 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 2.4970247745513916, + "learning_rate": 4.747923066691487e-06, + "loss": 0.5575, + "step": 236 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 2.9880685806274414, + "learning_rate": 4.745810358568588e-06, + "loss": 0.7264, + "step": 237 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 2.555328369140625, + "learning_rate": 4.743689307921342e-06, + "loss": 0.4545, + "step": 238 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 3.144932746887207, + "learning_rate": 4.741559922628828e-06, + "loss": 0.5429, + "step": 239 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 3.059807062149048, + "learning_rate": 4.739422210601085e-06, + "loss": 0.5086, + "step": 240 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 3.374303102493286, + "learning_rate": 4.7372761797790836e-06, + "loss": 0.6109, + "step": 241 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 2.4506947994232178, + "learning_rate": 4.735121838134697e-06, + "loss": 0.4317, + "step": 242 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 2.9039974212646484, + "learning_rate": 4.732959193670672e-06, + "loss": 0.6414, + "step": 243 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 2.9412453174591064, + "learning_rate": 4.730788254420593e-06, + "loss": 0.5166, + "step": 244 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 2.500716209411621, + "learning_rate": 4.728609028448862e-06, + "loss": 0.4982, + "step": 245 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 2.4233803749084473, + "learning_rate": 4.726421523850662e-06, + "loss": 0.7552, + "step": 246 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 2.357003688812256, + "learning_rate": 4.7242257487519275e-06, + "loss": 0.4365, + "step": 247 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 2.6406495571136475, + "learning_rate": 4.722021711309317e-06, + "loss": 0.6002, + "step": 248 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 2.736884832382202, + "learning_rate": 4.7198094197101826e-06, + "loss": 0.4993, + "step": 249 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 3.5238845348358154, + "learning_rate": 4.7175888821725335e-06, + "loss": 0.4637, + "step": 250 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 3.3783695697784424, + "learning_rate": 4.715360106945015e-06, + "loss": 0.9711, + "step": 251 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 2.9685862064361572, + "learning_rate": 4.713123102306869e-06, + "loss": 0.5452, + "step": 252 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 3.143733263015747, + "learning_rate": 4.710877876567912e-06, + "loss": 0.5034, + "step": 253 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 2.8005623817443848, + "learning_rate": 4.708624438068494e-06, + "loss": 0.4236, + "step": 254 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 2.66581130027771, + "learning_rate": 4.706362795179476e-06, + "loss": 0.6095, + "step": 255 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 4.598043441772461, + "learning_rate": 4.7040929563021975e-06, + "loss": 0.738, + "step": 256 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 3.5643506050109863, + "learning_rate": 4.70181492986844e-06, + "loss": 0.6726, + "step": 257 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 2.865339994430542, + "learning_rate": 4.699528724340401e-06, + "loss": 0.4862, + "step": 258 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 2.95529842376709, + "learning_rate": 4.6972343482106615e-06, + "loss": 0.5003, + "step": 259 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 2.45206356048584, + "learning_rate": 4.6949318100021546e-06, + "loss": 0.6734, + "step": 260 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 2.6789939403533936, + "learning_rate": 4.6926211182681295e-06, + "loss": 0.5639, + "step": 261 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 3.307732582092285, + "learning_rate": 4.690302281592128e-06, + "loss": 0.7032, + "step": 262 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 2.8950445652008057, + "learning_rate": 4.687975308587944e-06, + "loss": 0.4937, + "step": 263 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 2.969377040863037, + "learning_rate": 4.685640207899598e-06, + "loss": 0.5829, + "step": 264 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 3.106433391571045, + "learning_rate": 4.683296988201301e-06, + "loss": 0.3805, + "step": 265 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 3.5599050521850586, + "learning_rate": 4.680945658197425e-06, + "loss": 0.7939, + "step": 266 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 5.008603096008301, + "learning_rate": 4.6785862266224695e-06, + "loss": 0.7511, + "step": 267 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 3.1393773555755615, + "learning_rate": 4.676218702241026e-06, + "loss": 0.8984, + "step": 268 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 3.0241408348083496, + "learning_rate": 4.673843093847753e-06, + "loss": 0.5473, + "step": 269 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 2.9029417037963867, + "learning_rate": 4.6714594102673355e-06, + "loss": 0.6626, + "step": 270 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 3.4709246158599854, + "learning_rate": 4.669067660354456e-06, + "loss": 0.5015, + "step": 271 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 2.988635778427124, + "learning_rate": 4.666667852993761e-06, + "loss": 0.5384, + "step": 272 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 3.418140411376953, + "learning_rate": 4.664259997099829e-06, + "loss": 0.7491, + "step": 273 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 2.592416763305664, + "learning_rate": 4.661844101617135e-06, + "loss": 0.6451, + "step": 274 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 3.1174306869506836, + "learning_rate": 4.6594201755200205e-06, + "loss": 0.6299, + "step": 275 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 2.6569998264312744, + "learning_rate": 4.656988227812658e-06, + "loss": 0.4477, + "step": 276 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 3.5733959674835205, + "learning_rate": 4.654548267529015e-06, + "loss": 0.5473, + "step": 277 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 2.7240824699401855, + "learning_rate": 4.652100303732827e-06, + "loss": 0.496, + "step": 278 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 4.1965460777282715, + "learning_rate": 4.64964434551756e-06, + "loss": 0.932, + "step": 279 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 2.3237173557281494, + "learning_rate": 4.647180402006372e-06, + "loss": 0.4648, + "step": 280 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 3.395045042037964, + "learning_rate": 4.644708482352093e-06, + "loss": 0.7237, + "step": 281 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 3.238593816757202, + "learning_rate": 4.6422285957371735e-06, + "loss": 0.5531, + "step": 282 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 3.9651403427124023, + "learning_rate": 4.639740751373663e-06, + "loss": 0.6706, + "step": 283 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 3.0042061805725098, + "learning_rate": 4.63724495850317e-06, + "loss": 0.56, + "step": 284 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 3.094310760498047, + "learning_rate": 4.634741226396832e-06, + "loss": 0.6138, + "step": 285 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 2.838168144226074, + "learning_rate": 4.632229564355275e-06, + "loss": 0.4908, + "step": 286 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 3.3452796936035156, + "learning_rate": 4.629709981708586e-06, + "loss": 0.8181, + "step": 287 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 2.6630783081054688, + "learning_rate": 4.6271824878162704e-06, + "loss": 0.5625, + "step": 288 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 2.583650588989258, + "learning_rate": 4.624647092067226e-06, + "loss": 0.3416, + "step": 289 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 2.73132586479187, + "learning_rate": 4.622103803879702e-06, + "loss": 0.3889, + "step": 290 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 4.1010260581970215, + "learning_rate": 4.619552632701263e-06, + "loss": 0.611, + "step": 291 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 4.53068208694458, + "learning_rate": 4.61699358800876e-06, + "loss": 0.7219, + "step": 292 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 3.4877254962921143, + "learning_rate": 4.614426679308291e-06, + "loss": 0.6402, + "step": 293 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 2.9445226192474365, + "learning_rate": 4.611851916135166e-06, + "loss": 0.509, + "step": 294 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 2.6622228622436523, + "learning_rate": 4.609269308053872e-06, + "loss": 0.6167, + "step": 295 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 3.131530523300171, + "learning_rate": 4.606678864658039e-06, + "loss": 0.8039, + "step": 296 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 3.212188482284546, + "learning_rate": 4.604080595570399e-06, + "loss": 0.5754, + "step": 297 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 3.522850275039673, + "learning_rate": 4.601474510442759e-06, + "loss": 0.4432, + "step": 298 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 2.5877151489257812, + "learning_rate": 4.598860618955957e-06, + "loss": 0.6541, + "step": 299 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 2.803833484649658, + "learning_rate": 4.596238930819832e-06, + "loss": 0.5824, + "step": 300 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 2.7125494480133057, + "learning_rate": 4.5936094557731815e-06, + "loss": 0.6976, + "step": 301 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 3.6549370288848877, + "learning_rate": 4.590972203583732e-06, + "loss": 0.7105, + "step": 302 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 3.3241944313049316, + "learning_rate": 4.588327184048099e-06, + "loss": 0.7446, + "step": 303 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 2.8388822078704834, + "learning_rate": 4.585674406991752e-06, + "loss": 0.4926, + "step": 304 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 2.9760420322418213, + "learning_rate": 4.5830138822689755e-06, + "loss": 0.7368, + "step": 305 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 2.5437633991241455, + "learning_rate": 4.5803456197628374e-06, + "loss": 0.4678, + "step": 306 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 3.0044775009155273, + "learning_rate": 4.577669629385145e-06, + "loss": 0.4241, + "step": 307 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 2.6150901317596436, + "learning_rate": 4.574985921076418e-06, + "loss": 0.5327, + "step": 308 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 2.4425182342529297, + "learning_rate": 4.572294504805841e-06, + "loss": 0.7504, + "step": 309 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 2.9920194149017334, + "learning_rate": 4.569595390571232e-06, + "loss": 0.5194, + "step": 310 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 2.701087713241577, + "learning_rate": 4.566888588399007e-06, + "loss": 0.6862, + "step": 311 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 7.628893852233887, + "learning_rate": 4.564174108344139e-06, + "loss": 0.6867, + "step": 312 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 2.712947130203247, + "learning_rate": 4.561451960490123e-06, + "loss": 0.6942, + "step": 313 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 3.0063202381134033, + "learning_rate": 4.558722154948937e-06, + "loss": 0.6346, + "step": 314 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 2.957218647003174, + "learning_rate": 4.5559847018610034e-06, + "loss": 0.464, + "step": 315 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 3.322282552719116, + "learning_rate": 4.553239611395156e-06, + "loss": 0.6334, + "step": 316 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 3.0638647079467773, + "learning_rate": 4.550486893748596e-06, + "loss": 0.4227, + "step": 317 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 3.079087257385254, + "learning_rate": 4.547726559146862e-06, + "loss": 0.3719, + "step": 318 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 2.409914255142212, + "learning_rate": 4.544958617843782e-06, + "loss": 0.3331, + "step": 319 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 3.3441262245178223, + "learning_rate": 4.542183080121444e-06, + "loss": 0.6931, + "step": 320 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 2.6624436378479004, + "learning_rate": 4.539399956290152e-06, + "loss": 0.6578, + "step": 321 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 3.463789224624634, + "learning_rate": 4.536609256688396e-06, + "loss": 0.5748, + "step": 322 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 3.6827807426452637, + "learning_rate": 4.533810991682799e-06, + "loss": 0.5249, + "step": 323 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 4.125547409057617, + "learning_rate": 4.531005171668093e-06, + "loss": 0.3065, + "step": 324 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 2.935978412628174, + "learning_rate": 4.528191807067074e-06, + "loss": 0.5523, + "step": 325 + }, + { + "epoch": 2.0, + "grad_norm": 2.654388427734375, + "learning_rate": 4.525370908330564e-06, + "loss": 0.4157, + "step": 326 + }, + { + "epoch": 2.0061349693251533, + "grad_norm": 3.213925838470459, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4243, + "step": 327 + }, + { + "epoch": 2.0122699386503067, + "grad_norm": 3.5483286380767822, + "learning_rate": 4.519706550394248e-06, + "loss": 0.4137, + "step": 328 + }, + { + "epoch": 2.01840490797546, + "grad_norm": 3.32084059715271, + "learning_rate": 4.516863112235864e-06, + "loss": 0.5389, + "step": 329 + }, + { + "epoch": 2.0245398773006134, + "grad_norm": 3.427666425704956, + "learning_rate": 4.514012182024756e-06, + "loss": 0.285, + "step": 330 + }, + { + "epoch": 2.0306748466257667, + "grad_norm": 3.3269975185394287, + "learning_rate": 4.511153770351288e-06, + "loss": 0.4877, + "step": 331 + }, + { + "epoch": 2.03680981595092, + "grad_norm": 5.258850574493408, + "learning_rate": 4.508287887833619e-06, + "loss": 0.5168, + "step": 332 + }, + { + "epoch": 2.042944785276074, + "grad_norm": 4.316092491149902, + "learning_rate": 4.505414545117658e-06, + "loss": 0.4791, + "step": 333 + }, + { + "epoch": 2.049079754601227, + "grad_norm": 3.952056884765625, + "learning_rate": 4.502533752877028e-06, + "loss": 0.3014, + "step": 334 + }, + { + "epoch": 2.0552147239263805, + "grad_norm": 4.0617194175720215, + "learning_rate": 4.499645521813024e-06, + "loss": 0.4313, + "step": 335 + }, + { + "epoch": 2.061349693251534, + "grad_norm": 3.7869274616241455, + "learning_rate": 4.496749862654574e-06, + "loss": 0.4807, + "step": 336 + }, + { + "epoch": 2.067484662576687, + "grad_norm": 3.8181991577148438, + "learning_rate": 4.4938467861582e-06, + "loss": 0.4002, + "step": 337 + }, + { + "epoch": 2.0736196319018405, + "grad_norm": 3.8289854526519775, + "learning_rate": 4.490936303107975e-06, + "loss": 0.618, + "step": 338 + }, + { + "epoch": 2.079754601226994, + "grad_norm": 3.121443271636963, + "learning_rate": 4.488018424315488e-06, + "loss": 0.4203, + "step": 339 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 3.141782283782959, + "learning_rate": 4.4850931606198e-06, + "loss": 0.3618, + "step": 340 + }, + { + "epoch": 2.0920245398773005, + "grad_norm": 3.1279287338256836, + "learning_rate": 4.482160522887404e-06, + "loss": 0.4571, + "step": 341 + }, + { + "epoch": 2.098159509202454, + "grad_norm": 3.2418482303619385, + "learning_rate": 4.479220522012185e-06, + "loss": 0.2674, + "step": 342 + }, + { + "epoch": 2.104294478527607, + "grad_norm": 10.230683326721191, + "learning_rate": 4.476273168915382e-06, + "loss": 0.5479, + "step": 343 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 3.588361978530884, + "learning_rate": 4.473318474545544e-06, + "loss": 0.3654, + "step": 344 + }, + { + "epoch": 2.116564417177914, + "grad_norm": 3.0913164615631104, + "learning_rate": 4.470356449878489e-06, + "loss": 0.2704, + "step": 345 + }, + { + "epoch": 2.1226993865030677, + "grad_norm": 3.972447633743286, + "learning_rate": 4.467387105917269e-06, + "loss": 0.3029, + "step": 346 + }, + { + "epoch": 2.128834355828221, + "grad_norm": 3.7174713611602783, + "learning_rate": 4.464410453692122e-06, + "loss": 0.6536, + "step": 347 + }, + { + "epoch": 2.1349693251533743, + "grad_norm": 3.9333994388580322, + "learning_rate": 4.461426504260434e-06, + "loss": 0.3806, + "step": 348 + }, + { + "epoch": 2.1411042944785277, + "grad_norm": 4.752816200256348, + "learning_rate": 4.458435268706699e-06, + "loss": 0.4019, + "step": 349 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 2.505603790283203, + "learning_rate": 4.455436758142477e-06, + "loss": 0.2348, + "step": 350 + }, + { + "epoch": 2.1533742331288344, + "grad_norm": 3.3050570487976074, + "learning_rate": 4.452430983706351e-06, + "loss": 0.505, + "step": 351 + }, + { + "epoch": 2.1595092024539877, + "grad_norm": 5.387442588806152, + "learning_rate": 4.44941795656389e-06, + "loss": 0.399, + "step": 352 + }, + { + "epoch": 2.165644171779141, + "grad_norm": 3.4759480953216553, + "learning_rate": 4.446397687907601e-06, + "loss": 0.5664, + "step": 353 + }, + { + "epoch": 2.1717791411042944, + "grad_norm": 2.949445962905884, + "learning_rate": 4.4433701889568935e-06, + "loss": 0.2128, + "step": 354 + }, + { + "epoch": 2.1779141104294477, + "grad_norm": 3.2884252071380615, + "learning_rate": 4.440335470958035e-06, + "loss": 0.3138, + "step": 355 + }, + { + "epoch": 2.184049079754601, + "grad_norm": 3.1605632305145264, + "learning_rate": 4.437293545184111e-06, + "loss": 0.349, + "step": 356 + }, + { + "epoch": 2.190184049079755, + "grad_norm": 2.9996821880340576, + "learning_rate": 4.434244422934976e-06, + "loss": 0.343, + "step": 357 + }, + { + "epoch": 2.196319018404908, + "grad_norm": 3.6373324394226074, + "learning_rate": 4.431188115537226e-06, + "loss": 0.5656, + "step": 358 + }, + { + "epoch": 2.2024539877300615, + "grad_norm": 4.667621612548828, + "learning_rate": 4.428124634344141e-06, + "loss": 0.2335, + "step": 359 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 3.815484046936035, + "learning_rate": 4.425053990735653e-06, + "loss": 0.2173, + "step": 360 + }, + { + "epoch": 2.214723926380368, + "grad_norm": 4.689478874206543, + "learning_rate": 4.421976196118297e-06, + "loss": 0.5071, + "step": 361 + }, + { + "epoch": 2.2208588957055215, + "grad_norm": 4.016942024230957, + "learning_rate": 4.4188912619251765e-06, + "loss": 0.384, + "step": 362 + }, + { + "epoch": 2.226993865030675, + "grad_norm": 3.5336828231811523, + "learning_rate": 4.415799199615912e-06, + "loss": 0.3133, + "step": 363 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 2.9195592403411865, + "learning_rate": 4.4127000206766055e-06, + "loss": 0.3847, + "step": 364 + }, + { + "epoch": 2.2392638036809815, + "grad_norm": 2.6843531131744385, + "learning_rate": 4.409593736619795e-06, + "loss": 0.3539, + "step": 365 + }, + { + "epoch": 2.245398773006135, + "grad_norm": 2.8692703247070312, + "learning_rate": 4.40648035898441e-06, + "loss": 0.3664, + "step": 366 + }, + { + "epoch": 2.2515337423312882, + "grad_norm": 2.820422649383545, + "learning_rate": 4.403359899335732e-06, + "loss": 0.4606, + "step": 367 + }, + { + "epoch": 2.2576687116564416, + "grad_norm": 3.8641669750213623, + "learning_rate": 4.400232369265351e-06, + "loss": 0.2931, + "step": 368 + }, + { + "epoch": 2.263803680981595, + "grad_norm": 2.75347638130188, + "learning_rate": 4.39709778039112e-06, + "loss": 0.3393, + "step": 369 + }, + { + "epoch": 2.2699386503067487, + "grad_norm": 15.150428771972656, + "learning_rate": 4.393956144357113e-06, + "loss": 0.65, + "step": 370 + }, + { + "epoch": 2.276073619631902, + "grad_norm": 2.4876065254211426, + "learning_rate": 4.390807472833585e-06, + "loss": 0.372, + "step": 371 + }, + { + "epoch": 2.2822085889570554, + "grad_norm": 2.7328054904937744, + "learning_rate": 4.3876517775169216e-06, + "loss": 0.2802, + "step": 372 + }, + { + "epoch": 2.2883435582822087, + "grad_norm": 2.903221368789673, + "learning_rate": 4.384489070129604e-06, + "loss": 0.1964, + "step": 373 + }, + { + "epoch": 2.294478527607362, + "grad_norm": 3.9368724822998047, + "learning_rate": 4.381319362420158e-06, + "loss": 0.4272, + "step": 374 + }, + { + "epoch": 2.3006134969325154, + "grad_norm": 5.431981086730957, + "learning_rate": 4.378142666163114e-06, + "loss": 0.4513, + "step": 375 + }, + { + "epoch": 2.3067484662576687, + "grad_norm": 3.661733627319336, + "learning_rate": 4.374958993158965e-06, + "loss": 0.6087, + "step": 376 + }, + { + "epoch": 2.312883435582822, + "grad_norm": 3.004450559616089, + "learning_rate": 4.371768355234116e-06, + "loss": 0.2206, + "step": 377 + }, + { + "epoch": 2.3190184049079754, + "grad_norm": 4.3785576820373535, + "learning_rate": 4.368570764240852e-06, + "loss": 0.6055, + "step": 378 + }, + { + "epoch": 2.3251533742331287, + "grad_norm": 3.4699394702911377, + "learning_rate": 4.365366232057279e-06, + "loss": 0.6286, + "step": 379 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 2.6862998008728027, + "learning_rate": 4.3621547705872915e-06, + "loss": 0.2622, + "step": 380 + }, + { + "epoch": 2.3374233128834354, + "grad_norm": 3.056382179260254, + "learning_rate": 4.358936391760524e-06, + "loss": 0.3439, + "step": 381 + }, + { + "epoch": 2.3435582822085887, + "grad_norm": 2.6211307048797607, + "learning_rate": 4.355711107532305e-06, + "loss": 0.3677, + "step": 382 + }, + { + "epoch": 2.3496932515337425, + "grad_norm": 2.682060956954956, + "learning_rate": 4.3524789298836175e-06, + "loss": 0.3068, + "step": 383 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 3.482539415359497, + "learning_rate": 4.349239870821049e-06, + "loss": 0.3737, + "step": 384 + }, + { + "epoch": 2.361963190184049, + "grad_norm": 2.8645472526550293, + "learning_rate": 4.345993942376752e-06, + "loss": 0.2837, + "step": 385 + }, + { + "epoch": 2.3680981595092025, + "grad_norm": 3.6142354011535645, + "learning_rate": 4.342741156608392e-06, + "loss": 0.6162, + "step": 386 + }, + { + "epoch": 2.374233128834356, + "grad_norm": 3.0748162269592285, + "learning_rate": 4.3394815255991135e-06, + "loss": 0.2986, + "step": 387 + }, + { + "epoch": 2.3803680981595092, + "grad_norm": 5.090906620025635, + "learning_rate": 4.336215061457485e-06, + "loss": 0.5383, + "step": 388 + }, + { + "epoch": 2.3865030674846626, + "grad_norm": 3.9235823154449463, + "learning_rate": 4.332941776317458e-06, + "loss": 0.4179, + "step": 389 + }, + { + "epoch": 2.392638036809816, + "grad_norm": 3.482926368713379, + "learning_rate": 4.329661682338325e-06, + "loss": 0.3938, + "step": 390 + }, + { + "epoch": 2.3987730061349692, + "grad_norm": 4.274583339691162, + "learning_rate": 4.32637479170467e-06, + "loss": 0.3349, + "step": 391 + }, + { + "epoch": 2.4049079754601226, + "grad_norm": 3.326012372970581, + "learning_rate": 4.323081116626322e-06, + "loss": 0.3336, + "step": 392 + }, + { + "epoch": 2.411042944785276, + "grad_norm": 3.174591541290283, + "learning_rate": 4.319780669338316e-06, + "loss": 0.2983, + "step": 393 + }, + { + "epoch": 2.4171779141104293, + "grad_norm": 3.9073634147644043, + "learning_rate": 4.31647346210084e-06, + "loss": 0.8401, + "step": 394 + }, + { + "epoch": 2.4233128834355826, + "grad_norm": 3.4787721633911133, + "learning_rate": 4.313159507199197e-06, + "loss": 0.2583, + "step": 395 + }, + { + "epoch": 2.4294478527607364, + "grad_norm": 3.19903564453125, + "learning_rate": 4.309838816943755e-06, + "loss": 0.2861, + "step": 396 + }, + { + "epoch": 2.4355828220858897, + "grad_norm": 3.184246778488159, + "learning_rate": 4.306511403669897e-06, + "loss": 0.2956, + "step": 397 + }, + { + "epoch": 2.441717791411043, + "grad_norm": 3.8991878032684326, + "learning_rate": 4.303177279737988e-06, + "loss": 0.5378, + "step": 398 + }, + { + "epoch": 2.4478527607361964, + "grad_norm": 3.411949872970581, + "learning_rate": 4.299836457533313e-06, + "loss": 0.3423, + "step": 399 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 3.445502996444702, + "learning_rate": 4.296488949466046e-06, + "loss": 0.5608, + "step": 400 + }, + { + "epoch": 2.460122699386503, + "grad_norm": 3.066798210144043, + "learning_rate": 4.293134767971193e-06, + "loss": 0.3214, + "step": 401 + }, + { + "epoch": 2.4662576687116564, + "grad_norm": 3.0581583976745605, + "learning_rate": 4.28977392550855e-06, + "loss": 0.5117, + "step": 402 + }, + { + "epoch": 2.4723926380368098, + "grad_norm": 4.207413673400879, + "learning_rate": 4.286406434562659e-06, + "loss": 0.2666, + "step": 403 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 2.9934990406036377, + "learning_rate": 4.283032307642756e-06, + "loss": 0.2878, + "step": 404 + }, + { + "epoch": 2.4846625766871164, + "grad_norm": 3.800593614578247, + "learning_rate": 4.2796515572827305e-06, + "loss": 0.2619, + "step": 405 + }, + { + "epoch": 2.4907975460122698, + "grad_norm": 3.2029523849487305, + "learning_rate": 4.276264196041074e-06, + "loss": 0.1735, + "step": 406 + }, + { + "epoch": 2.4969325153374236, + "grad_norm": 3.515634059906006, + "learning_rate": 4.2728702365008356e-06, + "loss": 0.4741, + "step": 407 + }, + { + "epoch": 2.5030674846625764, + "grad_norm": 3.8354873657226562, + "learning_rate": 4.269469691269577e-06, + "loss": 0.3713, + "step": 408 + }, + { + "epoch": 2.5092024539877302, + "grad_norm": 3.902904510498047, + "learning_rate": 4.266062572979323e-06, + "loss": 0.5189, + "step": 409 + }, + { + "epoch": 2.5153374233128836, + "grad_norm": 3.3276097774505615, + "learning_rate": 4.262648894286515e-06, + "loss": 0.2461, + "step": 410 + }, + { + "epoch": 2.521472392638037, + "grad_norm": 2.9457011222839355, + "learning_rate": 4.259228667871963e-06, + "loss": 0.3013, + "step": 411 + }, + { + "epoch": 2.5276073619631902, + "grad_norm": 2.8941617012023926, + "learning_rate": 4.255801906440803e-06, + "loss": 0.2784, + "step": 412 + }, + { + "epoch": 2.5337423312883436, + "grad_norm": 2.949399471282959, + "learning_rate": 4.252368622722443e-06, + "loss": 0.457, + "step": 413 + }, + { + "epoch": 2.539877300613497, + "grad_norm": 3.342108726501465, + "learning_rate": 4.248928829470522e-06, + "loss": 0.487, + "step": 414 + }, + { + "epoch": 2.5460122699386503, + "grad_norm": 3.9556386470794678, + "learning_rate": 4.245482539462861e-06, + "loss": 0.6118, + "step": 415 + }, + { + "epoch": 2.5521472392638036, + "grad_norm": 3.6936280727386475, + "learning_rate": 4.242029765501411e-06, + "loss": 0.6131, + "step": 416 + }, + { + "epoch": 2.558282208588957, + "grad_norm": 2.79897403717041, + "learning_rate": 4.2385705204122104e-06, + "loss": 0.4209, + "step": 417 + }, + { + "epoch": 2.5644171779141103, + "grad_norm": 4.093318462371826, + "learning_rate": 4.235104817045338e-06, + "loss": 0.5375, + "step": 418 + }, + { + "epoch": 2.5705521472392636, + "grad_norm": 3.138263463973999, + "learning_rate": 4.231632668274861e-06, + "loss": 0.4682, + "step": 419 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 3.1465651988983154, + "learning_rate": 4.22815408699879e-06, + "loss": 0.2522, + "step": 420 + }, + { + "epoch": 2.5828220858895703, + "grad_norm": 3.5166101455688477, + "learning_rate": 4.22466908613903e-06, + "loss": 0.4776, + "step": 421 + }, + { + "epoch": 2.588957055214724, + "grad_norm": 2.8498189449310303, + "learning_rate": 4.221177678641333e-06, + "loss": 0.3067, + "step": 422 + }, + { + "epoch": 2.5950920245398774, + "grad_norm": 2.8046035766601562, + "learning_rate": 4.217679877475251e-06, + "loss": 0.2402, + "step": 423 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 4.204788684844971, + "learning_rate": 4.214175695634084e-06, + "loss": 0.2608, + "step": 424 + }, + { + "epoch": 2.607361963190184, + "grad_norm": 2.5569400787353516, + "learning_rate": 4.210665146134838e-06, + "loss": 0.2801, + "step": 425 + }, + { + "epoch": 2.6134969325153374, + "grad_norm": 3.5359091758728027, + "learning_rate": 4.20714824201817e-06, + "loss": 0.2027, + "step": 426 + }, + { + "epoch": 2.6196319018404908, + "grad_norm": 3.5132668018341064, + "learning_rate": 4.203624996348343e-06, + "loss": 0.4253, + "step": 427 + }, + { + "epoch": 2.625766871165644, + "grad_norm": 3.5076472759246826, + "learning_rate": 4.200095422213177e-06, + "loss": 0.3014, + "step": 428 + }, + { + "epoch": 2.6319018404907975, + "grad_norm": 3.6501238346099854, + "learning_rate": 4.196559532724004e-06, + "loss": 0.6526, + "step": 429 + }, + { + "epoch": 2.638036809815951, + "grad_norm": 2.849924325942993, + "learning_rate": 4.193017341015608e-06, + "loss": 0.4487, + "step": 430 + }, + { + "epoch": 2.644171779141104, + "grad_norm": 3.2228448390960693, + "learning_rate": 4.189468860246192e-06, + "loss": 0.5386, + "step": 431 + }, + { + "epoch": 2.6503067484662575, + "grad_norm": 2.532102108001709, + "learning_rate": 4.185914103597316e-06, + "loss": 0.3034, + "step": 432 + }, + { + "epoch": 2.6564417177914113, + "grad_norm": 2.862720251083374, + "learning_rate": 4.182353084273855e-06, + "loss": 0.5862, + "step": 433 + }, + { + "epoch": 2.662576687116564, + "grad_norm": 3.4617464542388916, + "learning_rate": 4.178785815503946e-06, + "loss": 0.3954, + "step": 434 + }, + { + "epoch": 2.668711656441718, + "grad_norm": 2.627758741378784, + "learning_rate": 4.1752123105389444e-06, + "loss": 0.4367, + "step": 435 + }, + { + "epoch": 2.6748466257668713, + "grad_norm": 3.2868380546569824, + "learning_rate": 4.171632582653368e-06, + "loss": 0.2997, + "step": 436 + }, + { + "epoch": 2.6809815950920246, + "grad_norm": 3.4260897636413574, + "learning_rate": 4.168046645144851e-06, + "loss": 0.3354, + "step": 437 + }, + { + "epoch": 2.687116564417178, + "grad_norm": 3.1415748596191406, + "learning_rate": 4.164454511334098e-06, + "loss": 0.5538, + "step": 438 + }, + { + "epoch": 2.6932515337423313, + "grad_norm": 3.3700919151306152, + "learning_rate": 4.160856194564828e-06, + "loss": 0.5731, + "step": 439 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 3.146968364715576, + "learning_rate": 4.157251708203728e-06, + "loss": 0.4429, + "step": 440 + }, + { + "epoch": 2.705521472392638, + "grad_norm": 3.7495830059051514, + "learning_rate": 4.153641065640402e-06, + "loss": 0.6361, + "step": 441 + }, + { + "epoch": 2.7116564417177913, + "grad_norm": 3.426499128341675, + "learning_rate": 4.150024280287327e-06, + "loss": 0.2418, + "step": 442 + }, + { + "epoch": 2.7177914110429446, + "grad_norm": 3.213719606399536, + "learning_rate": 4.146401365579795e-06, + "loss": 0.2549, + "step": 443 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 3.457742929458618, + "learning_rate": 4.142772334975868e-06, + "loss": 0.3822, + "step": 444 + }, + { + "epoch": 2.7300613496932513, + "grad_norm": 3.130410671234131, + "learning_rate": 4.139137201956324e-06, + "loss": 0.3107, + "step": 445 + }, + { + "epoch": 2.736196319018405, + "grad_norm": 2.7337112426757812, + "learning_rate": 4.1354959800246155e-06, + "loss": 0.2829, + "step": 446 + }, + { + "epoch": 2.7423312883435584, + "grad_norm": 3.427006483078003, + "learning_rate": 4.131848682706807e-06, + "loss": 0.3045, + "step": 447 + }, + { + "epoch": 2.7484662576687118, + "grad_norm": 3.3742318153381348, + "learning_rate": 4.128195323551536e-06, + "loss": 0.316, + "step": 448 + }, + { + "epoch": 2.754601226993865, + "grad_norm": 3.086738109588623, + "learning_rate": 4.1245359161299555e-06, + "loss": 0.5278, + "step": 449 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 3.4609954357147217, + "learning_rate": 4.120870474035687e-06, + "loss": 0.447, + "step": 450 + }, + { + "epoch": 2.766871165644172, + "grad_norm": 3.552663803100586, + "learning_rate": 4.1171990108847705e-06, + "loss": 0.6127, + "step": 451 + }, + { + "epoch": 2.773006134969325, + "grad_norm": 4.413427352905273, + "learning_rate": 4.113521540315609e-06, + "loss": 0.3304, + "step": 452 + }, + { + "epoch": 2.7791411042944785, + "grad_norm": 3.3408143520355225, + "learning_rate": 4.109838075988922e-06, + "loss": 0.5871, + "step": 453 + }, + { + "epoch": 2.785276073619632, + "grad_norm": 3.0659773349761963, + "learning_rate": 4.106148631587697e-06, + "loss": 0.3578, + "step": 454 + }, + { + "epoch": 2.791411042944785, + "grad_norm": 3.2854816913604736, + "learning_rate": 4.102453220817134e-06, + "loss": 0.4685, + "step": 455 + }, + { + "epoch": 2.7975460122699385, + "grad_norm": 3.4940855503082275, + "learning_rate": 4.098751857404595e-06, + "loss": 0.2818, + "step": 456 + }, + { + "epoch": 2.8036809815950923, + "grad_norm": 2.4630730152130127, + "learning_rate": 4.0950445550995566e-06, + "loss": 0.3497, + "step": 457 + }, + { + "epoch": 2.809815950920245, + "grad_norm": 3.3870959281921387, + "learning_rate": 4.091331327673554e-06, + "loss": 0.4954, + "step": 458 + }, + { + "epoch": 2.815950920245399, + "grad_norm": 2.3676836490631104, + "learning_rate": 4.087612188920135e-06, + "loss": 0.3884, + "step": 459 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 3.2477807998657227, + "learning_rate": 4.083887152654804e-06, + "loss": 0.375, + "step": 460 + }, + { + "epoch": 2.8282208588957056, + "grad_norm": 3.295673131942749, + "learning_rate": 4.080156232714976e-06, + "loss": 0.3272, + "step": 461 + }, + { + "epoch": 2.834355828220859, + "grad_norm": 2.800847291946411, + "learning_rate": 4.07641944295992e-06, + "loss": 0.2936, + "step": 462 + }, + { + "epoch": 2.8404907975460123, + "grad_norm": 3.443336009979248, + "learning_rate": 4.072676797270708e-06, + "loss": 0.2363, + "step": 463 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 3.1334242820739746, + "learning_rate": 4.0689283095501684e-06, + "loss": 0.4827, + "step": 464 + }, + { + "epoch": 2.852760736196319, + "grad_norm": 3.950672149658203, + "learning_rate": 4.06517399372283e-06, + "loss": 0.3163, + "step": 465 + }, + { + "epoch": 2.8588957055214723, + "grad_norm": 4.243579387664795, + "learning_rate": 4.061413863734869e-06, + "loss": 0.2827, + "step": 466 + }, + { + "epoch": 2.8650306748466257, + "grad_norm": 4.076017379760742, + "learning_rate": 4.057647933554063e-06, + "loss": 0.3466, + "step": 467 + }, + { + "epoch": 2.871165644171779, + "grad_norm": 2.846989631652832, + "learning_rate": 4.053876217169734e-06, + "loss": 0.4632, + "step": 468 + }, + { + "epoch": 2.8773006134969323, + "grad_norm": 2.74981689453125, + "learning_rate": 4.050098728592698e-06, + "loss": 0.2001, + "step": 469 + }, + { + "epoch": 2.883435582822086, + "grad_norm": 3.062068462371826, + "learning_rate": 4.046315481855211e-06, + "loss": 0.5425, + "step": 470 + }, + { + "epoch": 2.889570552147239, + "grad_norm": 2.8630964756011963, + "learning_rate": 4.0425264910109245e-06, + "loss": 0.424, + "step": 471 + }, + { + "epoch": 2.895705521472393, + "grad_norm": 3.537442922592163, + "learning_rate": 4.03873177013482e-06, + "loss": 0.2443, + "step": 472 + }, + { + "epoch": 2.901840490797546, + "grad_norm": 3.128535270690918, + "learning_rate": 4.034931333323173e-06, + "loss": 0.3734, + "step": 473 + }, + { + "epoch": 2.9079754601226995, + "grad_norm": 3.021897792816162, + "learning_rate": 4.031125194693484e-06, + "loss": 0.3762, + "step": 474 + }, + { + "epoch": 2.914110429447853, + "grad_norm": 3.0943546295166016, + "learning_rate": 4.0273133683844375e-06, + "loss": 0.3721, + "step": 475 + }, + { + "epoch": 2.920245398773006, + "grad_norm": 3.443448305130005, + "learning_rate": 4.023495868555848e-06, + "loss": 0.2868, + "step": 476 + }, + { + "epoch": 2.9263803680981595, + "grad_norm": 2.865227222442627, + "learning_rate": 4.0196727093886024e-06, + "loss": 0.5086, + "step": 477 + }, + { + "epoch": 2.932515337423313, + "grad_norm": 3.1272058486938477, + "learning_rate": 4.015843905084612e-06, + "loss": 0.4616, + "step": 478 + }, + { + "epoch": 2.938650306748466, + "grad_norm": 3.0584447383880615, + "learning_rate": 4.012009469866756e-06, + "loss": 0.403, + "step": 479 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 4.42616081237793, + "learning_rate": 4.008169417978836e-06, + "loss": 0.5801, + "step": 480 + }, + { + "epoch": 2.950920245398773, + "grad_norm": 2.8444535732269287, + "learning_rate": 4.004323763685511e-06, + "loss": 0.5808, + "step": 481 + }, + { + "epoch": 2.957055214723926, + "grad_norm": 2.591719627380371, + "learning_rate": 4.0004725212722565e-06, + "loss": 0.2584, + "step": 482 + }, + { + "epoch": 2.96319018404908, + "grad_norm": 2.5496113300323486, + "learning_rate": 3.996615705045302e-06, + "loss": 0.462, + "step": 483 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 2.9932925701141357, + "learning_rate": 3.992753329331588e-06, + "loss": 0.3502, + "step": 484 + }, + { + "epoch": 2.9754601226993866, + "grad_norm": 3.136871337890625, + "learning_rate": 3.9888854084786995e-06, + "loss": 0.5989, + "step": 485 + }, + { + "epoch": 2.98159509202454, + "grad_norm": 3.6654274463653564, + "learning_rate": 3.985011956854826e-06, + "loss": 0.6772, + "step": 486 + }, + { + "epoch": 2.9877300613496933, + "grad_norm": 2.5398948192596436, + "learning_rate": 3.9811329888487004e-06, + "loss": 0.4192, + "step": 487 + }, + { + "epoch": 2.9938650306748467, + "grad_norm": 4.89943790435791, + "learning_rate": 3.977248518869545e-06, + "loss": 0.4031, + "step": 488 + }, + { + "epoch": 3.0, + "grad_norm": 3.4729995727539062, + "learning_rate": 3.973358561347024e-06, + "loss": 0.7764, + "step": 489 + }, + { + "epoch": 3.0061349693251533, + "grad_norm": 5.331607818603516, + "learning_rate": 3.969463130731183e-06, + "loss": 0.3267, + "step": 490 + }, + { + "epoch": 3.0122699386503067, + "grad_norm": 3.453650712966919, + "learning_rate": 3.965562241492401e-06, + "loss": 0.2719, + "step": 491 + }, + { + "epoch": 3.01840490797546, + "grad_norm": 3.232313632965088, + "learning_rate": 3.9616559081213335e-06, + "loss": 0.1825, + "step": 492 + }, + { + "epoch": 3.0245398773006134, + "grad_norm": 3.4860260486602783, + "learning_rate": 3.957744145128858e-06, + "loss": 0.1854, + "step": 493 + }, + { + "epoch": 3.0306748466257667, + "grad_norm": 3.4357805252075195, + "learning_rate": 3.953826967046021e-06, + "loss": 0.2224, + "step": 494 + }, + { + "epoch": 3.03680981595092, + "grad_norm": 4.557503700256348, + "learning_rate": 3.9499043884239894e-06, + "loss": 0.349, + "step": 495 + }, + { + "epoch": 3.042944785276074, + "grad_norm": 4.685214042663574, + "learning_rate": 3.945976423833987e-06, + "loss": 0.175, + "step": 496 + }, + { + "epoch": 3.049079754601227, + "grad_norm": 3.7430171966552734, + "learning_rate": 3.942043087867244e-06, + "loss": 0.2773, + "step": 497 + }, + { + "epoch": 3.0552147239263805, + "grad_norm": 3.756450653076172, + "learning_rate": 3.938104395134947e-06, + "loss": 0.4445, + "step": 498 + }, + { + "epoch": 3.061349693251534, + "grad_norm": 4.049175262451172, + "learning_rate": 3.9341603602681805e-06, + "loss": 0.3046, + "step": 499 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 3.7689461708068848, + "learning_rate": 3.930210997917871e-06, + "loss": 0.2544, + "step": 500 + }, + { + "epoch": 3.0736196319018405, + "grad_norm": 4.027602195739746, + "learning_rate": 3.92625632275474e-06, + "loss": 0.3154, + "step": 501 + }, + { + "epoch": 3.079754601226994, + "grad_norm": 2.8449292182922363, + "learning_rate": 3.922296349469239e-06, + "loss": 0.2804, + "step": 502 + }, + { + "epoch": 3.085889570552147, + "grad_norm": 2.9555234909057617, + "learning_rate": 3.918331092771505e-06, + "loss": 0.2393, + "step": 503 + }, + { + "epoch": 3.0920245398773005, + "grad_norm": 2.621042013168335, + "learning_rate": 3.914360567391296e-06, + "loss": 0.1403, + "step": 504 + }, + { + "epoch": 3.098159509202454, + "grad_norm": 3.2348620891571045, + "learning_rate": 3.910384788077949e-06, + "loss": 0.1537, + "step": 505 + }, + { + "epoch": 3.104294478527607, + "grad_norm": 3.030179977416992, + "learning_rate": 3.906403769600311e-06, + "loss": 0.2921, + "step": 506 + }, + { + "epoch": 3.1104294478527605, + "grad_norm": 3.146428346633911, + "learning_rate": 3.902417526746694e-06, + "loss": 0.2036, + "step": 507 + }, + { + "epoch": 3.116564417177914, + "grad_norm": 3.6201512813568115, + "learning_rate": 3.898426074324818e-06, + "loss": 0.2655, + "step": 508 + }, + { + "epoch": 3.1226993865030677, + "grad_norm": 3.7674012184143066, + "learning_rate": 3.8944294271617524e-06, + "loss": 0.3938, + "step": 509 + }, + { + "epoch": 3.128834355828221, + "grad_norm": 4.54722785949707, + "learning_rate": 3.890427600103865e-06, + "loss": 0.3051, + "step": 510 + }, + { + "epoch": 3.1349693251533743, + "grad_norm": 4.228236675262451, + "learning_rate": 3.886420608016767e-06, + "loss": 0.3719, + "step": 511 + }, + { + "epoch": 3.1411042944785277, + "grad_norm": 4.355110168457031, + "learning_rate": 3.882408465785252e-06, + "loss": 0.1863, + "step": 512 + }, + { + "epoch": 3.147239263803681, + "grad_norm": 3.451460838317871, + "learning_rate": 3.878391188313249e-06, + "loss": 0.1479, + "step": 513 + }, + { + "epoch": 3.1533742331288344, + "grad_norm": 4.395524501800537, + "learning_rate": 3.87436879052376e-06, + "loss": 0.238, + "step": 514 + }, + { + "epoch": 3.1595092024539877, + "grad_norm": 2.940717935562134, + "learning_rate": 3.870341287358809e-06, + "loss": 0.2069, + "step": 515 + }, + { + "epoch": 3.165644171779141, + "grad_norm": 2.5817320346832275, + "learning_rate": 3.8663086937793845e-06, + "loss": 0.1189, + "step": 516 + }, + { + "epoch": 3.1717791411042944, + "grad_norm": 3.9863343238830566, + "learning_rate": 3.862271024765385e-06, + "loss": 0.3434, + "step": 517 + }, + { + "epoch": 3.1779141104294477, + "grad_norm": 3.609004259109497, + "learning_rate": 3.8582282953155626e-06, + "loss": 0.1602, + "step": 518 + }, + { + "epoch": 3.184049079754601, + "grad_norm": 3.207533121109009, + "learning_rate": 3.854180520447465e-06, + "loss": 0.3452, + "step": 519 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 3.593388795852661, + "learning_rate": 3.850127715197387e-06, + "loss": 0.2832, + "step": 520 + }, + { + "epoch": 3.196319018404908, + "grad_norm": 3.409064531326294, + "learning_rate": 3.846069894620306e-06, + "loss": 0.1481, + "step": 521 + }, + { + "epoch": 3.2024539877300615, + "grad_norm": 3.461498737335205, + "learning_rate": 3.84200707378983e-06, + "loss": 0.1283, + "step": 522 + }, + { + "epoch": 3.208588957055215, + "grad_norm": 3.708467483520508, + "learning_rate": 3.8379392677981434e-06, + "loss": 0.2468, + "step": 523 + }, + { + "epoch": 3.214723926380368, + "grad_norm": 2.802381753921509, + "learning_rate": 3.833866491755947e-06, + "loss": 0.2685, + "step": 524 + }, + { + "epoch": 3.2208588957055215, + "grad_norm": 3.0787744522094727, + "learning_rate": 3.8297887607924044e-06, + "loss": 0.2595, + "step": 525 + }, + { + "epoch": 3.226993865030675, + "grad_norm": 3.3952548503875732, + "learning_rate": 3.825706090055088e-06, + "loss": 0.4099, + "step": 526 + }, + { + "epoch": 3.233128834355828, + "grad_norm": 3.3497085571289062, + "learning_rate": 3.821618494709916e-06, + "loss": 0.287, + "step": 527 + }, + { + "epoch": 3.2392638036809815, + "grad_norm": 4.050611972808838, + "learning_rate": 3.817525989941102e-06, + "loss": 0.2369, + "step": 528 + }, + { + "epoch": 3.245398773006135, + "grad_norm": 2.87642240524292, + "learning_rate": 3.8134285909510972e-06, + "loss": 0.2751, + "step": 529 + }, + { + "epoch": 3.2515337423312882, + "grad_norm": 3.821941614151001, + "learning_rate": 3.8093263129605305e-06, + "loss": 0.2363, + "step": 530 + }, + { + "epoch": 3.2576687116564416, + "grad_norm": 2.8066117763519287, + "learning_rate": 3.80521917120816e-06, + "loss": 0.094, + "step": 531 + }, + { + "epoch": 3.263803680981595, + "grad_norm": 3.849768877029419, + "learning_rate": 3.801107180950806e-06, + "loss": 0.4117, + "step": 532 + }, + { + "epoch": 3.2699386503067487, + "grad_norm": 2.4161250591278076, + "learning_rate": 3.7969903574633028e-06, + "loss": 0.1183, + "step": 533 + }, + { + "epoch": 3.276073619631902, + "grad_norm": 3.6743111610412598, + "learning_rate": 3.792868716038437e-06, + "loss": 0.2296, + "step": 534 + }, + { + "epoch": 3.2822085889570554, + "grad_norm": 4.378123760223389, + "learning_rate": 3.7887422719868937e-06, + "loss": 0.2678, + "step": 535 + }, + { + "epoch": 3.2883435582822087, + "grad_norm": 4.816481590270996, + "learning_rate": 3.784611040637198e-06, + "loss": 0.4887, + "step": 536 + }, + { + "epoch": 3.294478527607362, + "grad_norm": 3.5712430477142334, + "learning_rate": 3.7804750373356576e-06, + "loss": 0.3827, + "step": 537 + }, + { + "epoch": 3.3006134969325154, + "grad_norm": 3.6877355575561523, + "learning_rate": 3.776334277446307e-06, + "loss": 0.3233, + "step": 538 + }, + { + "epoch": 3.3067484662576687, + "grad_norm": 3.442706346511841, + "learning_rate": 3.7721887763508512e-06, + "loss": 0.1256, + "step": 539 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 3.9265615940093994, + "learning_rate": 3.7680385494486053e-06, + "loss": 0.3845, + "step": 540 + }, + { + "epoch": 3.3190184049079754, + "grad_norm": 3.5030126571655273, + "learning_rate": 3.7638836121564414e-06, + "loss": 0.2905, + "step": 541 + }, + { + "epoch": 3.3251533742331287, + "grad_norm": 3.6685378551483154, + "learning_rate": 3.7597239799087283e-06, + "loss": 0.3561, + "step": 542 + }, + { + "epoch": 3.331288343558282, + "grad_norm": 3.8484046459198, + "learning_rate": 3.7555596681572736e-06, + "loss": 0.1157, + "step": 543 + }, + { + "epoch": 3.3374233128834354, + "grad_norm": 3.7977402210235596, + "learning_rate": 3.751390692371272e-06, + "loss": 0.3049, + "step": 544 + }, + { + "epoch": 3.3435582822085887, + "grad_norm": 3.4409852027893066, + "learning_rate": 3.7472170680372398e-06, + "loss": 0.1626, + "step": 545 + }, + { + "epoch": 3.3496932515337425, + "grad_norm": 3.801541328430176, + "learning_rate": 3.7430388106589632e-06, + "loss": 0.2414, + "step": 546 + }, + { + "epoch": 3.355828220858896, + "grad_norm": 4.025203704833984, + "learning_rate": 3.738855935757438e-06, + "loss": 0.3441, + "step": 547 + }, + { + "epoch": 3.361963190184049, + "grad_norm": 4.242798805236816, + "learning_rate": 3.7346684588708135e-06, + "loss": 0.5244, + "step": 548 + }, + { + "epoch": 3.3680981595092025, + "grad_norm": 3.0516819953918457, + "learning_rate": 3.7304763955543332e-06, + "loss": 0.1984, + "step": 549 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 3.894667625427246, + "learning_rate": 3.726279761380279e-06, + "loss": 0.2715, + "step": 550 + }, + { + "epoch": 3.3803680981595092, + "grad_norm": 3.171208143234253, + "learning_rate": 3.72207857193791e-06, + "loss": 0.1537, + "step": 551 + }, + { + "epoch": 3.3865030674846626, + "grad_norm": 4.344860553741455, + "learning_rate": 3.7178728428334092e-06, + "loss": 0.2388, + "step": 552 + }, + { + "epoch": 3.392638036809816, + "grad_norm": 2.766317367553711, + "learning_rate": 3.7136625896898226e-06, + "loss": 0.1726, + "step": 553 + }, + { + "epoch": 3.3987730061349692, + "grad_norm": 3.550662040710449, + "learning_rate": 3.7094478281470003e-06, + "loss": 0.2942, + "step": 554 + }, + { + "epoch": 3.4049079754601226, + "grad_norm": 3.4576945304870605, + "learning_rate": 3.7052285738615412e-06, + "loss": 0.1665, + "step": 555 + }, + { + "epoch": 3.411042944785276, + "grad_norm": 4.026793003082275, + "learning_rate": 3.7010048425067317e-06, + "loss": 0.3954, + "step": 556 + }, + { + "epoch": 3.4171779141104293, + "grad_norm": 4.600133419036865, + "learning_rate": 3.696776649772492e-06, + "loss": 0.3207, + "step": 557 + }, + { + "epoch": 3.4233128834355826, + "grad_norm": 4.747331142425537, + "learning_rate": 3.692544011365312e-06, + "loss": 0.1325, + "step": 558 + }, + { + "epoch": 3.4294478527607364, + "grad_norm": 3.781464099884033, + "learning_rate": 3.6883069430081986e-06, + "loss": 0.1644, + "step": 559 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 2.905986785888672, + "learning_rate": 3.6840654604406135e-06, + "loss": 0.2469, + "step": 560 + }, + { + "epoch": 3.441717791411043, + "grad_norm": 2.3747711181640625, + "learning_rate": 3.679819579418414e-06, + "loss": 0.1146, + "step": 561 + }, + { + "epoch": 3.4478527607361964, + "grad_norm": 3.2683632373809814, + "learning_rate": 3.6755693157137995e-06, + "loss": 0.3236, + "step": 562 + }, + { + "epoch": 3.4539877300613497, + "grad_norm": 3.7750496864318848, + "learning_rate": 3.6713146851152487e-06, + "loss": 0.399, + "step": 563 + }, + { + "epoch": 3.460122699386503, + "grad_norm": 3.3912384510040283, + "learning_rate": 3.667055703427461e-06, + "loss": 0.1259, + "step": 564 + }, + { + "epoch": 3.4662576687116564, + "grad_norm": 3.0224430561065674, + "learning_rate": 3.6627923864713e-06, + "loss": 0.1835, + "step": 565 + }, + { + "epoch": 3.4723926380368098, + "grad_norm": 3.642258405685425, + "learning_rate": 3.658524750083733e-06, + "loss": 0.2763, + "step": 566 + }, + { + "epoch": 3.478527607361963, + "grad_norm": 3.409890651702881, + "learning_rate": 3.654252810117773e-06, + "loss": 0.2496, + "step": 567 + }, + { + "epoch": 3.4846625766871164, + "grad_norm": 3.0416476726531982, + "learning_rate": 3.6499765824424195e-06, + "loss": 0.1287, + "step": 568 + }, + { + "epoch": 3.4907975460122698, + "grad_norm": 3.1963987350463867, + "learning_rate": 3.6456960829425987e-06, + "loss": 0.1747, + "step": 569 + }, + { + "epoch": 3.4969325153374236, + "grad_norm": 3.198448657989502, + "learning_rate": 3.641411327519107e-06, + "loss": 0.1913, + "step": 570 + }, + { + "epoch": 3.5030674846625764, + "grad_norm": 3.7023441791534424, + "learning_rate": 3.6371223320885492e-06, + "loss": 0.3224, + "step": 571 + }, + { + "epoch": 3.5092024539877302, + "grad_norm": 4.54288387298584, + "learning_rate": 3.6328291125832803e-06, + "loss": 0.2364, + "step": 572 + }, + { + "epoch": 3.5153374233128836, + "grad_norm": 3.5064890384674072, + "learning_rate": 3.628531684951347e-06, + "loss": 0.2552, + "step": 573 + }, + { + "epoch": 3.521472392638037, + "grad_norm": 3.987583875656128, + "learning_rate": 3.6242300651564276e-06, + "loss": 0.3232, + "step": 574 + }, + { + "epoch": 3.5276073619631902, + "grad_norm": 3.179642915725708, + "learning_rate": 3.6199242691777745e-06, + "loss": 0.32, + "step": 575 + }, + { + "epoch": 3.5337423312883436, + "grad_norm": 3.3078157901763916, + "learning_rate": 3.6156143130101516e-06, + "loss": 0.2922, + "step": 576 + }, + { + "epoch": 3.539877300613497, + "grad_norm": 3.1628613471984863, + "learning_rate": 3.6113002126637765e-06, + "loss": 0.2005, + "step": 577 + }, + { + "epoch": 3.5460122699386503, + "grad_norm": 3.4515540599823, + "learning_rate": 3.606981984164263e-06, + "loss": 0.2138, + "step": 578 + }, + { + "epoch": 3.5521472392638036, + "grad_norm": 5.132473945617676, + "learning_rate": 3.6026596435525578e-06, + "loss": 0.4382, + "step": 579 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 3.397614002227783, + "learning_rate": 3.5983332068848855e-06, + "loss": 0.3326, + "step": 580 + }, + { + "epoch": 3.5644171779141103, + "grad_norm": 4.79497766494751, + "learning_rate": 3.5940026902326825e-06, + "loss": 0.4748, + "step": 581 + }, + { + "epoch": 3.5705521472392636, + "grad_norm": 3.7675018310546875, + "learning_rate": 3.5896681096825446e-06, + "loss": 0.2692, + "step": 582 + }, + { + "epoch": 3.5766871165644174, + "grad_norm": 3.0637521743774414, + "learning_rate": 3.5853294813361614e-06, + "loss": 0.3658, + "step": 583 + }, + { + "epoch": 3.5828220858895703, + "grad_norm": 2.8949790000915527, + "learning_rate": 3.5809868213102623e-06, + "loss": 0.1661, + "step": 584 + }, + { + "epoch": 3.588957055214724, + "grad_norm": 3.163419246673584, + "learning_rate": 3.5766401457365485e-06, + "loss": 0.1233, + "step": 585 + }, + { + "epoch": 3.5950920245398774, + "grad_norm": 3.1787965297698975, + "learning_rate": 3.5722894707616417e-06, + "loss": 0.278, + "step": 586 + }, + { + "epoch": 3.6012269938650308, + "grad_norm": 2.9397857189178467, + "learning_rate": 3.5679348125470175e-06, + "loss": 0.1541, + "step": 587 + }, + { + "epoch": 3.607361963190184, + "grad_norm": 3.2690396308898926, + "learning_rate": 3.56357618726895e-06, + "loss": 0.1575, + "step": 588 + }, + { + "epoch": 3.6134969325153374, + "grad_norm": 5.444014072418213, + "learning_rate": 3.5592136111184483e-06, + "loss": 0.8079, + "step": 589 + }, + { + "epoch": 3.6196319018404908, + "grad_norm": 3.1688313484191895, + "learning_rate": 3.554847100301199e-06, + "loss": 0.341, + "step": 590 + }, + { + "epoch": 3.625766871165644, + "grad_norm": 2.469212532043457, + "learning_rate": 3.550476671037505e-06, + "loss": 0.1625, + "step": 591 + }, + { + "epoch": 3.6319018404907975, + "grad_norm": 3.3956527709960938, + "learning_rate": 3.546102339562223e-06, + "loss": 0.199, + "step": 592 + }, + { + "epoch": 3.638036809815951, + "grad_norm": 2.7287702560424805, + "learning_rate": 3.5417241221247078e-06, + "loss": 0.1493, + "step": 593 + }, + { + "epoch": 3.644171779141104, + "grad_norm": 3.5046865940093994, + "learning_rate": 3.5373420349887477e-06, + "loss": 0.2765, + "step": 594 + }, + { + "epoch": 3.6503067484662575, + "grad_norm": 3.121476650238037, + "learning_rate": 3.5329560944325065e-06, + "loss": 0.2833, + "step": 595 + }, + { + "epoch": 3.6564417177914113, + "grad_norm": 3.276463270187378, + "learning_rate": 3.528566316748462e-06, + "loss": 0.1237, + "step": 596 + }, + { + "epoch": 3.662576687116564, + "grad_norm": 3.382840633392334, + "learning_rate": 3.524172718243347e-06, + "loss": 0.1599, + "step": 597 + }, + { + "epoch": 3.668711656441718, + "grad_norm": 4.801311492919922, + "learning_rate": 3.5197753152380854e-06, + "loss": 0.2997, + "step": 598 + }, + { + "epoch": 3.6748466257668713, + "grad_norm": 4.117336273193359, + "learning_rate": 3.515374124067736e-06, + "loss": 0.2021, + "step": 599 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 3.611438035964966, + "learning_rate": 3.5109691610814263e-06, + "loss": 0.1726, + "step": 600 + }, + { + "epoch": 3.687116564417178, + "grad_norm": 4.5179972648620605, + "learning_rate": 3.5065604426422995e-06, + "loss": 0.1377, + "step": 601 + }, + { + "epoch": 3.6932515337423313, + "grad_norm": 3.561061382293701, + "learning_rate": 3.502147985127445e-06, + "loss": 0.1497, + "step": 602 + }, + { + "epoch": 3.6993865030674846, + "grad_norm": 3.3497917652130127, + "learning_rate": 3.4977318049278443e-06, + "loss": 0.1589, + "step": 603 + }, + { + "epoch": 3.705521472392638, + "grad_norm": 3.2725470066070557, + "learning_rate": 3.4933119184483065e-06, + "loss": 0.1364, + "step": 604 + }, + { + "epoch": 3.7116564417177913, + "grad_norm": 3.228956460952759, + "learning_rate": 3.4888883421074076e-06, + "loss": 0.177, + "step": 605 + }, + { + "epoch": 3.7177914110429446, + "grad_norm": 3.7648911476135254, + "learning_rate": 3.484461092337434e-06, + "loss": 0.122, + "step": 606 + }, + { + "epoch": 3.7239263803680984, + "grad_norm": 3.5322585105895996, + "learning_rate": 3.4800301855843137e-06, + "loss": 0.2664, + "step": 607 + }, + { + "epoch": 3.7300613496932513, + "grad_norm": 2.951073169708252, + "learning_rate": 3.4755956383075613e-06, + "loss": 0.12, + "step": 608 + }, + { + "epoch": 3.736196319018405, + "grad_norm": 3.0577664375305176, + "learning_rate": 3.471157466980214e-06, + "loss": 0.3926, + "step": 609 + }, + { + "epoch": 3.7423312883435584, + "grad_norm": 4.089846134185791, + "learning_rate": 3.466715688088772e-06, + "loss": 0.6233, + "step": 610 + }, + { + "epoch": 3.7484662576687118, + "grad_norm": 3.081340789794922, + "learning_rate": 3.462270318133136e-06, + "loss": 0.2456, + "step": 611 + }, + { + "epoch": 3.754601226993865, + "grad_norm": 3.034712553024292, + "learning_rate": 3.4578213736265474e-06, + "loss": 0.2683, + "step": 612 + }, + { + "epoch": 3.7607361963190185, + "grad_norm": 3.459815740585327, + "learning_rate": 3.4533688710955255e-06, + "loss": 0.3796, + "step": 613 + }, + { + "epoch": 3.766871165644172, + "grad_norm": 3.523737907409668, + "learning_rate": 3.448912827079805e-06, + "loss": 0.3326, + "step": 614 + }, + { + "epoch": 3.773006134969325, + "grad_norm": 3.333219289779663, + "learning_rate": 3.4444532581322793e-06, + "loss": 0.206, + "step": 615 + }, + { + "epoch": 3.7791411042944785, + "grad_norm": 3.582387685775757, + "learning_rate": 3.4399901808189327e-06, + "loss": 0.244, + "step": 616 + }, + { + "epoch": 3.785276073619632, + "grad_norm": 3.4887266159057617, + "learning_rate": 3.435523611718785e-06, + "loss": 0.1796, + "step": 617 + }, + { + "epoch": 3.791411042944785, + "grad_norm": 4.89408016204834, + "learning_rate": 3.4310535674238242e-06, + "loss": 0.188, + "step": 618 + }, + { + "epoch": 3.7975460122699385, + "grad_norm": 4.338910102844238, + "learning_rate": 3.42658006453895e-06, + "loss": 0.3039, + "step": 619 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 4.107708930969238, + "learning_rate": 3.4221031196819083e-06, + "loss": 0.3383, + "step": 620 + }, + { + "epoch": 3.809815950920245, + "grad_norm": 3.698777675628662, + "learning_rate": 3.4176227494832305e-06, + "loss": 0.1721, + "step": 621 + }, + { + "epoch": 3.815950920245399, + "grad_norm": 2.6659226417541504, + "learning_rate": 3.413138970586174e-06, + "loss": 0.2211, + "step": 622 + }, + { + "epoch": 3.8220858895705523, + "grad_norm": 3.2398436069488525, + "learning_rate": 3.4086517996466574e-06, + "loss": 0.1871, + "step": 623 + }, + { + "epoch": 3.8282208588957056, + "grad_norm": 4.9128804206848145, + "learning_rate": 3.404161253333199e-06, + "loss": 0.3874, + "step": 624 + }, + { + "epoch": 3.834355828220859, + "grad_norm": 3.508789300918579, + "learning_rate": 3.3996673483268573e-06, + "loss": 0.1739, + "step": 625 + }, + { + "epoch": 3.8404907975460123, + "grad_norm": 3.3016927242279053, + "learning_rate": 3.3951701013211665e-06, + "loss": 0.274, + "step": 626 + }, + { + "epoch": 3.8466257668711656, + "grad_norm": 3.8941333293914795, + "learning_rate": 3.3906695290220736e-06, + "loss": 0.3568, + "step": 627 + }, + { + "epoch": 3.852760736196319, + "grad_norm": 3.512354850769043, + "learning_rate": 3.3861656481478816e-06, + "loss": 0.157, + "step": 628 + }, + { + "epoch": 3.8588957055214723, + "grad_norm": 3.482649326324463, + "learning_rate": 3.3816584754291814e-06, + "loss": 0.1218, + "step": 629 + }, + { + "epoch": 3.8650306748466257, + "grad_norm": 3.1490275859832764, + "learning_rate": 3.377148027608793e-06, + "loss": 0.2234, + "step": 630 + }, + { + "epoch": 3.871165644171779, + "grad_norm": 3.2172653675079346, + "learning_rate": 3.3726343214417023e-06, + "loss": 0.3329, + "step": 631 + }, + { + "epoch": 3.8773006134969323, + "grad_norm": 4.167707443237305, + "learning_rate": 3.3681173736949984e-06, + "loss": 0.1384, + "step": 632 + }, + { + "epoch": 3.883435582822086, + "grad_norm": 3.4743919372558594, + "learning_rate": 3.3635972011478134e-06, + "loss": 0.3807, + "step": 633 + }, + { + "epoch": 3.889570552147239, + "grad_norm": 3.6892173290252686, + "learning_rate": 3.3590738205912566e-06, + "loss": 0.194, + "step": 634 + }, + { + "epoch": 3.895705521472393, + "grad_norm": 3.262967824935913, + "learning_rate": 3.354547248828356e-06, + "loss": 0.202, + "step": 635 + }, + { + "epoch": 3.901840490797546, + "grad_norm": 3.8871562480926514, + "learning_rate": 3.3500175026739916e-06, + "loss": 0.2471, + "step": 636 + }, + { + "epoch": 3.9079754601226995, + "grad_norm": 3.5097084045410156, + "learning_rate": 3.3454845989548385e-06, + "loss": 0.1112, + "step": 637 + }, + { + "epoch": 3.914110429447853, + "grad_norm": 4.163944721221924, + "learning_rate": 3.3409485545092995e-06, + "loss": 0.3368, + "step": 638 + }, + { + "epoch": 3.920245398773006, + "grad_norm": 3.6405045986175537, + "learning_rate": 3.336409386187444e-06, + "loss": 0.1863, + "step": 639 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 3.2477526664733887, + "learning_rate": 3.331867110850946e-06, + "loss": 0.1491, + "step": 640 + }, + { + "epoch": 3.932515337423313, + "grad_norm": 3.933753490447998, + "learning_rate": 3.327321745373021e-06, + "loss": 0.2484, + "step": 641 + }, + { + "epoch": 3.938650306748466, + "grad_norm": 3.2475059032440186, + "learning_rate": 3.322773306638364e-06, + "loss": 0.2126, + "step": 642 + }, + { + "epoch": 3.9447852760736195, + "grad_norm": 2.628467321395874, + "learning_rate": 3.318221811543086e-06, + "loss": 0.1649, + "step": 643 + }, + { + "epoch": 3.950920245398773, + "grad_norm": 3.2612411975860596, + "learning_rate": 3.313667276994651e-06, + "loss": 0.1442, + "step": 644 + }, + { + "epoch": 3.957055214723926, + "grad_norm": 3.8058395385742188, + "learning_rate": 3.309109719911814e-06, + "loss": 0.359, + "step": 645 + }, + { + "epoch": 3.96319018404908, + "grad_norm": 3.3450071811676025, + "learning_rate": 3.304549157224558e-06, + "loss": 0.4042, + "step": 646 + }, + { + "epoch": 3.969325153374233, + "grad_norm": 3.079601287841797, + "learning_rate": 3.299985605874031e-06, + "loss": 0.1699, + "step": 647 + }, + { + "epoch": 3.9754601226993866, + "grad_norm": 3.8963980674743652, + "learning_rate": 3.295419082812483e-06, + "loss": 0.1888, + "step": 648 + }, + { + "epoch": 3.98159509202454, + "grad_norm": 3.307405948638916, + "learning_rate": 3.2908496050032024e-06, + "loss": 0.2824, + "step": 649 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 3.227478265762329, + "learning_rate": 3.2862771894204544e-06, + "loss": 0.3038, + "step": 650 + }, + { + "epoch": 3.9938650306748467, + "grad_norm": 4.046506881713867, + "learning_rate": 3.2817018530494164e-06, + "loss": 0.3266, + "step": 651 + }, + { + "epoch": 4.0, + "grad_norm": 7.775874614715576, + "learning_rate": 3.277123612886116e-06, + "loss": 0.2998, + "step": 652 + }, + { + "epoch": 4.006134969325154, + "grad_norm": 3.146462917327881, + "learning_rate": 3.272542485937369e-06, + "loss": 0.2764, + "step": 653 + }, + { + "epoch": 4.012269938650307, + "grad_norm": 3.0539863109588623, + "learning_rate": 3.2679584892207118e-06, + "loss": 0.1157, + "step": 654 + }, + { + "epoch": 4.0184049079754605, + "grad_norm": 3.634021520614624, + "learning_rate": 3.263371639764343e-06, + "loss": 0.0707, + "step": 655 + }, + { + "epoch": 4.024539877300613, + "grad_norm": 3.3474650382995605, + "learning_rate": 3.2587819546070596e-06, + "loss": 0.1067, + "step": 656 + }, + { + "epoch": 4.030674846625767, + "grad_norm": 4.409244537353516, + "learning_rate": 3.254189450798189e-06, + "loss": 0.0564, + "step": 657 + }, + { + "epoch": 4.03680981595092, + "grad_norm": 3.0446252822875977, + "learning_rate": 3.2495941453975312e-06, + "loss": 0.0535, + "step": 658 + }, + { + "epoch": 4.042944785276074, + "grad_norm": 4.014753818511963, + "learning_rate": 3.2449960554752935e-06, + "loss": 0.1245, + "step": 659 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 3.188062906265259, + "learning_rate": 3.240395198112026e-06, + "loss": 0.0626, + "step": 660 + }, + { + "epoch": 4.0552147239263805, + "grad_norm": 3.006086826324463, + "learning_rate": 3.2357915903985605e-06, + "loss": 0.1198, + "step": 661 + }, + { + "epoch": 4.061349693251533, + "grad_norm": 2.8865551948547363, + "learning_rate": 3.2311852494359423e-06, + "loss": 0.0454, + "step": 662 + }, + { + "epoch": 4.067484662576687, + "grad_norm": 4.2888007164001465, + "learning_rate": 3.226576192335373e-06, + "loss": 0.2064, + "step": 663 + }, + { + "epoch": 4.07361963190184, + "grad_norm": 3.1414525508880615, + "learning_rate": 3.2219644362181436e-06, + "loss": 0.2183, + "step": 664 + }, + { + "epoch": 4.079754601226994, + "grad_norm": 2.556277275085449, + "learning_rate": 3.21734999821557e-06, + "loss": 0.0516, + "step": 665 + }, + { + "epoch": 4.085889570552148, + "grad_norm": 2.698118209838867, + "learning_rate": 3.2127328954689307e-06, + "loss": 0.0613, + "step": 666 + }, + { + "epoch": 4.0920245398773005, + "grad_norm": 2.869919538497925, + "learning_rate": 3.2081131451294025e-06, + "loss": 0.0583, + "step": 667 + }, + { + "epoch": 4.098159509202454, + "grad_norm": 3.8786919116973877, + "learning_rate": 3.2034907643579988e-06, + "loss": 0.0766, + "step": 668 + }, + { + "epoch": 4.104294478527607, + "grad_norm": 4.224637031555176, + "learning_rate": 3.1988657703255043e-06, + "loss": 0.1099, + "step": 669 + }, + { + "epoch": 4.110429447852761, + "grad_norm": 4.671669006347656, + "learning_rate": 3.194238180212409e-06, + "loss": 0.1663, + "step": 670 + }, + { + "epoch": 4.116564417177914, + "grad_norm": 3.2484257221221924, + "learning_rate": 3.1896080112088477e-06, + "loss": 0.0587, + "step": 671 + }, + { + "epoch": 4.122699386503068, + "grad_norm": 2.4808075428009033, + "learning_rate": 3.184975280514536e-06, + "loss": 0.0579, + "step": 672 + }, + { + "epoch": 4.128834355828221, + "grad_norm": 3.7106919288635254, + "learning_rate": 3.1803400053387044e-06, + "loss": 0.1083, + "step": 673 + }, + { + "epoch": 4.134969325153374, + "grad_norm": 3.008970260620117, + "learning_rate": 3.175702202900036e-06, + "loss": 0.1355, + "step": 674 + }, + { + "epoch": 4.141104294478527, + "grad_norm": 3.2640793323516846, + "learning_rate": 3.1710618904266006e-06, + "loss": 0.092, + "step": 675 + }, + { + "epoch": 4.147239263803681, + "grad_norm": 3.08042049407959, + "learning_rate": 3.166419085155793e-06, + "loss": 0.0563, + "step": 676 + }, + { + "epoch": 4.153374233128835, + "grad_norm": 2.993530511856079, + "learning_rate": 3.1617738043342695e-06, + "loss": 0.1773, + "step": 677 + }, + { + "epoch": 4.159509202453988, + "grad_norm": 2.6218204498291016, + "learning_rate": 3.157126065217879e-06, + "loss": 0.0489, + "step": 678 + }, + { + "epoch": 4.1656441717791415, + "grad_norm": 4.3173723220825195, + "learning_rate": 3.152475885071606e-06, + "loss": 0.1333, + "step": 679 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 3.659149408340454, + "learning_rate": 3.147823281169498e-06, + "loss": 0.1501, + "step": 680 + }, + { + "epoch": 4.177914110429448, + "grad_norm": 3.0953338146209717, + "learning_rate": 3.143168270794612e-06, + "loss": 0.1067, + "step": 681 + }, + { + "epoch": 4.184049079754601, + "grad_norm": 3.5693907737731934, + "learning_rate": 3.1385108712389394e-06, + "loss": 0.2499, + "step": 682 + }, + { + "epoch": 4.190184049079755, + "grad_norm": 3.3022868633270264, + "learning_rate": 3.1338510998033484e-06, + "loss": 0.1748, + "step": 683 + }, + { + "epoch": 4.196319018404908, + "grad_norm": 3.7468113899230957, + "learning_rate": 3.129188973797519e-06, + "loss": 0.201, + "step": 684 + }, + { + "epoch": 4.2024539877300615, + "grad_norm": 2.8381078243255615, + "learning_rate": 3.124524510539875e-06, + "loss": 0.0735, + "step": 685 + }, + { + "epoch": 4.208588957055214, + "grad_norm": 2.84706974029541, + "learning_rate": 3.119857727357527e-06, + "loss": 0.1806, + "step": 686 + }, + { + "epoch": 4.214723926380368, + "grad_norm": 3.8130292892456055, + "learning_rate": 3.1151886415861993e-06, + "loss": 0.1811, + "step": 687 + }, + { + "epoch": 4.220858895705521, + "grad_norm": 3.528895378112793, + "learning_rate": 3.1105172705701708e-06, + "loss": 0.1634, + "step": 688 + }, + { + "epoch": 4.226993865030675, + "grad_norm": 5.028727054595947, + "learning_rate": 3.1058436316622103e-06, + "loss": 0.1625, + "step": 689 + }, + { + "epoch": 4.233128834355828, + "grad_norm": 4.606889247894287, + "learning_rate": 3.1011677422235093e-06, + "loss": 0.1791, + "step": 690 + }, + { + "epoch": 4.2392638036809815, + "grad_norm": 3.3620636463165283, + "learning_rate": 3.0964896196236217e-06, + "loss": 0.2233, + "step": 691 + }, + { + "epoch": 4.245398773006135, + "grad_norm": 3.7845852375030518, + "learning_rate": 3.0918092812403954e-06, + "loss": 0.1142, + "step": 692 + }, + { + "epoch": 4.251533742331288, + "grad_norm": 3.1204118728637695, + "learning_rate": 3.0871267444599098e-06, + "loss": 0.096, + "step": 693 + }, + { + "epoch": 4.257668711656442, + "grad_norm": 3.686067819595337, + "learning_rate": 3.0824420266764093e-06, + "loss": 0.2749, + "step": 694 + }, + { + "epoch": 4.263803680981595, + "grad_norm": 3.1680829524993896, + "learning_rate": 3.077755145292243e-06, + "loss": 0.2504, + "step": 695 + }, + { + "epoch": 4.269938650306749, + "grad_norm": 3.3179469108581543, + "learning_rate": 3.0730661177177957e-06, + "loss": 0.1324, + "step": 696 + }, + { + "epoch": 4.276073619631902, + "grad_norm": 3.1186370849609375, + "learning_rate": 3.0683749613714238e-06, + "loss": 0.0691, + "step": 697 + }, + { + "epoch": 4.282208588957055, + "grad_norm": 3.086834192276001, + "learning_rate": 3.063681693679391e-06, + "loss": 0.1026, + "step": 698 + }, + { + "epoch": 4.288343558282208, + "grad_norm": 4.629584312438965, + "learning_rate": 3.0589863320758063e-06, + "loss": 0.2646, + "step": 699 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 3.9641213417053223, + "learning_rate": 3.0542888940025562e-06, + "loss": 0.1711, + "step": 700 + }, + { + "epoch": 4.300613496932515, + "grad_norm": 3.75014328956604, + "learning_rate": 3.0495893969092395e-06, + "loss": 0.0589, + "step": 701 + }, + { + "epoch": 4.306748466257669, + "grad_norm": 3.603290319442749, + "learning_rate": 3.044887858253105e-06, + "loss": 0.2244, + "step": 702 + }, + { + "epoch": 4.3128834355828225, + "grad_norm": 3.79404616355896, + "learning_rate": 3.040184295498984e-06, + "loss": 0.1506, + "step": 703 + }, + { + "epoch": 4.319018404907975, + "grad_norm": 3.0890021324157715, + "learning_rate": 3.035478726119228e-06, + "loss": 0.2343, + "step": 704 + }, + { + "epoch": 4.325153374233129, + "grad_norm": 3.6688191890716553, + "learning_rate": 3.0307711675936426e-06, + "loss": 0.0518, + "step": 705 + }, + { + "epoch": 4.331288343558282, + "grad_norm": 5.1836700439453125, + "learning_rate": 3.0260616374094208e-06, + "loss": 0.2363, + "step": 706 + }, + { + "epoch": 4.337423312883436, + "grad_norm": 2.7123284339904785, + "learning_rate": 3.0213501530610807e-06, + "loss": 0.0848, + "step": 707 + }, + { + "epoch": 4.343558282208589, + "grad_norm": 3.5661890506744385, + "learning_rate": 3.0166367320504005e-06, + "loss": 0.149, + "step": 708 + }, + { + "epoch": 4.3496932515337425, + "grad_norm": 3.6454737186431885, + "learning_rate": 3.0119213918863515e-06, + "loss": 0.1133, + "step": 709 + }, + { + "epoch": 4.355828220858895, + "grad_norm": 3.7534968852996826, + "learning_rate": 3.0072041500850343e-06, + "loss": 0.1358, + "step": 710 + }, + { + "epoch": 4.361963190184049, + "grad_norm": 3.40387225151062, + "learning_rate": 3.0024850241696128e-06, + "loss": 0.0706, + "step": 711 + }, + { + "epoch": 4.368098159509202, + "grad_norm": 3.250471591949463, + "learning_rate": 2.9977640316702512e-06, + "loss": 0.1977, + "step": 712 + }, + { + "epoch": 4.374233128834356, + "grad_norm": 3.417781352996826, + "learning_rate": 2.993041190124047e-06, + "loss": 0.2622, + "step": 713 + }, + { + "epoch": 4.38036809815951, + "grad_norm": 2.628434181213379, + "learning_rate": 2.9883165170749657e-06, + "loss": 0.1487, + "step": 714 + }, + { + "epoch": 4.386503067484663, + "grad_norm": 3.240264892578125, + "learning_rate": 2.9835900300737763e-06, + "loss": 0.0822, + "step": 715 + }, + { + "epoch": 4.392638036809816, + "grad_norm": 6.575517177581787, + "learning_rate": 2.9788617466779884e-06, + "loss": 0.3668, + "step": 716 + }, + { + "epoch": 4.398773006134969, + "grad_norm": 4.699089050292969, + "learning_rate": 2.974131684451781e-06, + "loss": 0.2432, + "step": 717 + }, + { + "epoch": 4.404907975460123, + "grad_norm": 2.9815752506256104, + "learning_rate": 2.9693998609659443e-06, + "loss": 0.0689, + "step": 718 + }, + { + "epoch": 4.411042944785276, + "grad_norm": 4.192755222320557, + "learning_rate": 2.9646662937978082e-06, + "loss": 0.1897, + "step": 719 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 2.9729068279266357, + "learning_rate": 2.9599310005311824e-06, + "loss": 0.0457, + "step": 720 + }, + { + "epoch": 4.423312883435583, + "grad_norm": 4.234438896179199, + "learning_rate": 2.9551939987562866e-06, + "loss": 0.2307, + "step": 721 + }, + { + "epoch": 4.429447852760736, + "grad_norm": 3.3982434272766113, + "learning_rate": 2.950455306069688e-06, + "loss": 0.0637, + "step": 722 + }, + { + "epoch": 4.435582822085889, + "grad_norm": 4.539764404296875, + "learning_rate": 2.9457149400742357e-06, + "loss": 0.1924, + "step": 723 + }, + { + "epoch": 4.441717791411043, + "grad_norm": 4.039684772491455, + "learning_rate": 2.940972918378993e-06, + "loss": 0.1275, + "step": 724 + }, + { + "epoch": 4.447852760736196, + "grad_norm": 4.340360641479492, + "learning_rate": 2.936229258599174e-06, + "loss": 0.123, + "step": 725 + }, + { + "epoch": 4.45398773006135, + "grad_norm": 2.8720109462738037, + "learning_rate": 2.93148397835608e-06, + "loss": 0.0555, + "step": 726 + }, + { + "epoch": 4.460122699386503, + "grad_norm": 4.227811336517334, + "learning_rate": 2.926737095277029e-06, + "loss": 0.0991, + "step": 727 + }, + { + "epoch": 4.466257668711656, + "grad_norm": 2.8079142570495605, + "learning_rate": 2.921988626995295e-06, + "loss": 0.0628, + "step": 728 + }, + { + "epoch": 4.47239263803681, + "grad_norm": 4.195122241973877, + "learning_rate": 2.9172385911500385e-06, + "loss": 0.2333, + "step": 729 + }, + { + "epoch": 4.478527607361963, + "grad_norm": 3.223794460296631, + "learning_rate": 2.9124870053862447e-06, + "loss": 0.1317, + "step": 730 + }, + { + "epoch": 4.484662576687117, + "grad_norm": 3.5533759593963623, + "learning_rate": 2.907733887354657e-06, + "loss": 0.2285, + "step": 731 + }, + { + "epoch": 4.49079754601227, + "grad_norm": 3.535673141479492, + "learning_rate": 2.9029792547117088e-06, + "loss": 0.096, + "step": 732 + }, + { + "epoch": 4.4969325153374236, + "grad_norm": 4.031703948974609, + "learning_rate": 2.898223125119461e-06, + "loss": 0.1505, + "step": 733 + }, + { + "epoch": 4.5030674846625764, + "grad_norm": 2.823413610458374, + "learning_rate": 2.893465516245534e-06, + "loss": 0.0327, + "step": 734 + }, + { + "epoch": 4.50920245398773, + "grad_norm": 3.516738176345825, + "learning_rate": 2.8887064457630453e-06, + "loss": 0.0743, + "step": 735 + }, + { + "epoch": 4.515337423312883, + "grad_norm": 3.5523500442504883, + "learning_rate": 2.8839459313505407e-06, + "loss": 0.1768, + "step": 736 + }, + { + "epoch": 4.521472392638037, + "grad_norm": 3.2433223724365234, + "learning_rate": 2.879183990691929e-06, + "loss": 0.1598, + "step": 737 + }, + { + "epoch": 4.52760736196319, + "grad_norm": 3.0156848430633545, + "learning_rate": 2.8744206414764185e-06, + "loss": 0.0829, + "step": 738 + }, + { + "epoch": 4.533742331288344, + "grad_norm": 4.359529495239258, + "learning_rate": 2.8696559013984488e-06, + "loss": 0.1169, + "step": 739 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 2.3862433433532715, + "learning_rate": 2.8648897881576274e-06, + "loss": 0.0962, + "step": 740 + }, + { + "epoch": 4.54601226993865, + "grad_norm": 2.7100136280059814, + "learning_rate": 2.8601223194586613e-06, + "loss": 0.1204, + "step": 741 + }, + { + "epoch": 4.552147239263804, + "grad_norm": 3.8116140365600586, + "learning_rate": 2.8553535130112935e-06, + "loss": 0.0685, + "step": 742 + }, + { + "epoch": 4.558282208588957, + "grad_norm": 2.9640142917633057, + "learning_rate": 2.850583386530235e-06, + "loss": 0.0692, + "step": 743 + }, + { + "epoch": 4.564417177914111, + "grad_norm": 3.264592170715332, + "learning_rate": 2.8458119577351035e-06, + "loss": 0.2128, + "step": 744 + }, + { + "epoch": 4.570552147239264, + "grad_norm": 3.230497360229492, + "learning_rate": 2.841039244350351e-06, + "loss": 0.2409, + "step": 745 + }, + { + "epoch": 4.576687116564417, + "grad_norm": 4.41513204574585, + "learning_rate": 2.8362652641052024e-06, + "loss": 0.1878, + "step": 746 + }, + { + "epoch": 4.58282208588957, + "grad_norm": 3.047248601913452, + "learning_rate": 2.83149003473359e-06, + "loss": 0.1303, + "step": 747 + }, + { + "epoch": 4.588957055214724, + "grad_norm": 2.399754047393799, + "learning_rate": 2.8267135739740836e-06, + "loss": 0.0577, + "step": 748 + }, + { + "epoch": 4.595092024539877, + "grad_norm": 4.608038425445557, + "learning_rate": 2.8219358995698307e-06, + "loss": 0.2329, + "step": 749 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 3.537644147872925, + "learning_rate": 2.8171570292684846e-06, + "loss": 0.1329, + "step": 750 + }, + { + "epoch": 4.6073619631901845, + "grad_norm": 2.8099827766418457, + "learning_rate": 2.8123769808221407e-06, + "loss": 0.1512, + "step": 751 + }, + { + "epoch": 4.613496932515337, + "grad_norm": 3.3169758319854736, + "learning_rate": 2.8075957719872724e-06, + "loss": 0.1267, + "step": 752 + }, + { + "epoch": 4.61963190184049, + "grad_norm": 3.578435182571411, + "learning_rate": 2.8028134205246633e-06, + "loss": 0.147, + "step": 753 + }, + { + "epoch": 4.625766871165644, + "grad_norm": 3.544437885284424, + "learning_rate": 2.7980299441993415e-06, + "loss": 0.0947, + "step": 754 + }, + { + "epoch": 4.631901840490798, + "grad_norm": 3.798776388168335, + "learning_rate": 2.793245360780512e-06, + "loss": 0.1498, + "step": 755 + }, + { + "epoch": 4.638036809815951, + "grad_norm": 3.634991407394409, + "learning_rate": 2.788459688041495e-06, + "loss": 0.2504, + "step": 756 + }, + { + "epoch": 4.644171779141105, + "grad_norm": 20.123680114746094, + "learning_rate": 2.783672943759655e-06, + "loss": 0.2091, + "step": 757 + }, + { + "epoch": 4.6503067484662575, + "grad_norm": 3.9357221126556396, + "learning_rate": 2.778885145716339e-06, + "loss": 0.2045, + "step": 758 + }, + { + "epoch": 4.656441717791411, + "grad_norm": 3.3035309314727783, + "learning_rate": 2.7740963116968063e-06, + "loss": 0.1416, + "step": 759 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 3.096985101699829, + "learning_rate": 2.7693064594901646e-06, + "loss": 0.0455, + "step": 760 + }, + { + "epoch": 4.668711656441718, + "grad_norm": 2.9855458736419678, + "learning_rate": 2.7645156068893075e-06, + "loss": 0.1496, + "step": 761 + }, + { + "epoch": 4.674846625766871, + "grad_norm": 3.9140093326568604, + "learning_rate": 2.759723771690839e-06, + "loss": 0.2061, + "step": 762 + }, + { + "epoch": 4.680981595092025, + "grad_norm": 3.590569496154785, + "learning_rate": 2.754930971695019e-06, + "loss": 0.1017, + "step": 763 + }, + { + "epoch": 4.6871165644171775, + "grad_norm": 3.527254581451416, + "learning_rate": 2.750137224705687e-06, + "loss": 0.1979, + "step": 764 + }, + { + "epoch": 4.693251533742331, + "grad_norm": 4.198459148406982, + "learning_rate": 2.745342548530202e-06, + "loss": 0.1667, + "step": 765 + }, + { + "epoch": 4.699386503067485, + "grad_norm": 2.0246167182922363, + "learning_rate": 2.7405469609793746e-06, + "loss": 0.0346, + "step": 766 + }, + { + "epoch": 4.705521472392638, + "grad_norm": 3.2045300006866455, + "learning_rate": 2.7357504798674004e-06, + "loss": 0.0596, + "step": 767 + }, + { + "epoch": 4.711656441717792, + "grad_norm": 2.736985921859741, + "learning_rate": 2.730953123011796e-06, + "loss": 0.0384, + "step": 768 + }, + { + "epoch": 4.717791411042945, + "grad_norm": 3.0621395111083984, + "learning_rate": 2.726154908233328e-06, + "loss": 0.0558, + "step": 769 + }, + { + "epoch": 4.723926380368098, + "grad_norm": 3.2280497550964355, + "learning_rate": 2.721355853355953e-06, + "loss": 0.2272, + "step": 770 + }, + { + "epoch": 4.730061349693251, + "grad_norm": 3.342226028442383, + "learning_rate": 2.716555976206748e-06, + "loss": 0.074, + "step": 771 + }, + { + "epoch": 4.736196319018405, + "grad_norm": 4.328624248504639, + "learning_rate": 2.7117552946158415e-06, + "loss": 0.1034, + "step": 772 + }, + { + "epoch": 4.742331288343558, + "grad_norm": 2.980215311050415, + "learning_rate": 2.706953826416353e-06, + "loss": 0.1199, + "step": 773 + }, + { + "epoch": 4.748466257668712, + "grad_norm": 2.622478485107422, + "learning_rate": 2.702151589444324e-06, + "loss": 0.0467, + "step": 774 + }, + { + "epoch": 4.754601226993865, + "grad_norm": 2.9958693981170654, + "learning_rate": 2.6973486015386507e-06, + "loss": 0.143, + "step": 775 + }, + { + "epoch": 4.7607361963190185, + "grad_norm": 4.548511505126953, + "learning_rate": 2.6925448805410197e-06, + "loss": 0.3594, + "step": 776 + }, + { + "epoch": 4.766871165644172, + "grad_norm": 3.3429481983184814, + "learning_rate": 2.6877404442958393e-06, + "loss": 0.1397, + "step": 777 + }, + { + "epoch": 4.773006134969325, + "grad_norm": 2.5820136070251465, + "learning_rate": 2.682935310650177e-06, + "loss": 0.054, + "step": 778 + }, + { + "epoch": 4.779141104294479, + "grad_norm": 4.047626495361328, + "learning_rate": 2.6781294974536886e-06, + "loss": 0.1284, + "step": 779 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 3.0227510929107666, + "learning_rate": 2.673323022558557e-06, + "loss": 0.1441, + "step": 780 + }, + { + "epoch": 4.791411042944786, + "grad_norm": 4.731313705444336, + "learning_rate": 2.6685159038194202e-06, + "loss": 0.2859, + "step": 781 + }, + { + "epoch": 4.7975460122699385, + "grad_norm": 3.880655288696289, + "learning_rate": 2.6637081590933096e-06, + "loss": 0.1524, + "step": 782 + }, + { + "epoch": 4.803680981595092, + "grad_norm": 2.375474452972412, + "learning_rate": 2.6588998062395803e-06, + "loss": 0.0338, + "step": 783 + }, + { + "epoch": 4.809815950920245, + "grad_norm": 3.3587446212768555, + "learning_rate": 2.6540908631198498e-06, + "loss": 0.0755, + "step": 784 + }, + { + "epoch": 4.815950920245399, + "grad_norm": 2.767686367034912, + "learning_rate": 2.6492813475979243e-06, + "loss": 0.0631, + "step": 785 + }, + { + "epoch": 4.822085889570552, + "grad_norm": 3.88670015335083, + "learning_rate": 2.6444712775397397e-06, + "loss": 0.0853, + "step": 786 + }, + { + "epoch": 4.828220858895706, + "grad_norm": 3.543276309967041, + "learning_rate": 2.639660670813288e-06, + "loss": 0.1895, + "step": 787 + }, + { + "epoch": 4.8343558282208585, + "grad_norm": 3.659323215484619, + "learning_rate": 2.6348495452885598e-06, + "loss": 0.1745, + "step": 788 + }, + { + "epoch": 4.840490797546012, + "grad_norm": 3.0955021381378174, + "learning_rate": 2.630037918837468e-06, + "loss": 0.0846, + "step": 789 + }, + { + "epoch": 4.846625766871165, + "grad_norm": 3.4473249912261963, + "learning_rate": 2.6252258093337892e-06, + "loss": 0.0808, + "step": 790 + }, + { + "epoch": 4.852760736196319, + "grad_norm": 3.937120199203491, + "learning_rate": 2.6204132346530936e-06, + "loss": 0.2054, + "step": 791 + }, + { + "epoch": 4.858895705521473, + "grad_norm": 4.052806854248047, + "learning_rate": 2.6156002126726788e-06, + "loss": 0.1679, + "step": 792 + }, + { + "epoch": 4.865030674846626, + "grad_norm": 2.6694889068603516, + "learning_rate": 2.6107867612715043e-06, + "loss": 0.0534, + "step": 793 + }, + { + "epoch": 4.871165644171779, + "grad_norm": 3.594649076461792, + "learning_rate": 2.6059728983301267e-06, + "loss": 0.0899, + "step": 794 + }, + { + "epoch": 4.877300613496932, + "grad_norm": 2.7796030044555664, + "learning_rate": 2.601158641730629e-06, + "loss": 0.0596, + "step": 795 + }, + { + "epoch": 4.883435582822086, + "grad_norm": 4.618961334228516, + "learning_rate": 2.5963440093565567e-06, + "loss": 0.3858, + "step": 796 + }, + { + "epoch": 4.889570552147239, + "grad_norm": 3.0783939361572266, + "learning_rate": 2.5915290190928518e-06, + "loss": 0.12, + "step": 797 + }, + { + "epoch": 4.895705521472393, + "grad_norm": 4.078456878662109, + "learning_rate": 2.586713688825786e-06, + "loss": 0.1278, + "step": 798 + }, + { + "epoch": 4.901840490797546, + "grad_norm": 2.9439120292663574, + "learning_rate": 2.5818980364428935e-06, + "loss": 0.0847, + "step": 799 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 5.140681743621826, + "learning_rate": 2.5770820798329055e-06, + "loss": 0.1718, + "step": 800 + }, + { + "epoch": 4.914110429447852, + "grad_norm": 3.450190305709839, + "learning_rate": 2.572265836885682e-06, + "loss": 0.0895, + "step": 801 + }, + { + "epoch": 4.920245398773006, + "grad_norm": 3.1145224571228027, + "learning_rate": 2.567449325492149e-06, + "loss": 0.0652, + "step": 802 + }, + { + "epoch": 4.92638036809816, + "grad_norm": 2.851768732070923, + "learning_rate": 2.5626325635442283e-06, + "loss": 0.0877, + "step": 803 + }, + { + "epoch": 4.932515337423313, + "grad_norm": 3.3392980098724365, + "learning_rate": 2.5578155689347716e-06, + "loss": 0.2028, + "step": 804 + }, + { + "epoch": 4.938650306748467, + "grad_norm": 3.012439250946045, + "learning_rate": 2.5529983595574964e-06, + "loss": 0.031, + "step": 805 + }, + { + "epoch": 4.9447852760736195, + "grad_norm": 2.7732717990875244, + "learning_rate": 2.548180953306918e-06, + "loss": 0.0415, + "step": 806 + }, + { + "epoch": 4.950920245398773, + "grad_norm": 3.0423903465270996, + "learning_rate": 2.5433633680782817e-06, + "loss": 0.1188, + "step": 807 + }, + { + "epoch": 4.957055214723926, + "grad_norm": 5.056387901306152, + "learning_rate": 2.538545621767498e-06, + "loss": 0.1703, + "step": 808 + }, + { + "epoch": 4.96319018404908, + "grad_norm": 4.052585124969482, + "learning_rate": 2.533727732271077e-06, + "loss": 0.1455, + "step": 809 + }, + { + "epoch": 4.969325153374233, + "grad_norm": 3.4507904052734375, + "learning_rate": 2.5289097174860593e-06, + "loss": 0.0617, + "step": 810 + }, + { + "epoch": 4.975460122699387, + "grad_norm": 2.908266305923462, + "learning_rate": 2.524091595309952e-06, + "loss": 0.1173, + "step": 811 + }, + { + "epoch": 4.9815950920245395, + "grad_norm": 2.5857458114624023, + "learning_rate": 2.519273383640661e-06, + "loss": 0.0538, + "step": 812 + }, + { + "epoch": 4.987730061349693, + "grad_norm": 3.3518428802490234, + "learning_rate": 2.5144551003764227e-06, + "loss": 0.211, + "step": 813 + }, + { + "epoch": 4.993865030674847, + "grad_norm": 3.137981653213501, + "learning_rate": 2.509636763415742e-06, + "loss": 0.0944, + "step": 814 + }, + { + "epoch": 5.0, + "grad_norm": 2.8854241371154785, + "learning_rate": 2.5048183906573227e-06, + "loss": 0.098, + "step": 815 + }, + { + "epoch": 5.006134969325154, + "grad_norm": 3.508527994155884, + "learning_rate": 2.5e-06, + "loss": 0.1102, + "step": 816 + }, + { + "epoch": 5.012269938650307, + "grad_norm": 2.448152542114258, + "learning_rate": 2.495181609342678e-06, + "loss": 0.0712, + "step": 817 + }, + { + "epoch": 5.0184049079754605, + "grad_norm": 3.105818748474121, + "learning_rate": 2.4903632365842587e-06, + "loss": 0.0414, + "step": 818 + }, + { + "epoch": 5.024539877300613, + "grad_norm": 3.8048601150512695, + "learning_rate": 2.4855448996235777e-06, + "loss": 0.0894, + "step": 819 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 3.259834051132202, + "learning_rate": 2.48072661635934e-06, + "loss": 0.0796, + "step": 820 + }, + { + "epoch": 5.03680981595092, + "grad_norm": 2.822364568710327, + "learning_rate": 2.475908404690049e-06, + "loss": 0.0349, + "step": 821 + }, + { + "epoch": 5.042944785276074, + "grad_norm": 4.78808069229126, + "learning_rate": 2.4710902825139415e-06, + "loss": 0.2529, + "step": 822 + }, + { + "epoch": 5.049079754601227, + "grad_norm": 3.5420572757720947, + "learning_rate": 2.466272267728924e-06, + "loss": 0.1405, + "step": 823 + }, + { + "epoch": 5.0552147239263805, + "grad_norm": 2.500713348388672, + "learning_rate": 2.461454378232503e-06, + "loss": 0.0408, + "step": 824 + }, + { + "epoch": 5.061349693251533, + "grad_norm": 3.266291618347168, + "learning_rate": 2.4566366319217196e-06, + "loss": 0.0338, + "step": 825 + }, + { + "epoch": 5.067484662576687, + "grad_norm": 4.071012020111084, + "learning_rate": 2.4518190466930837e-06, + "loss": 0.06, + "step": 826 + }, + { + "epoch": 5.07361963190184, + "grad_norm": 4.3747172355651855, + "learning_rate": 2.4470016404425045e-06, + "loss": 0.1184, + "step": 827 + }, + { + "epoch": 5.079754601226994, + "grad_norm": 3.92030668258667, + "learning_rate": 2.4421844310652296e-06, + "loss": 0.1369, + "step": 828 + }, + { + "epoch": 5.085889570552148, + "grad_norm": 3.3482303619384766, + "learning_rate": 2.437367436455773e-06, + "loss": 0.1166, + "step": 829 + }, + { + "epoch": 5.0920245398773005, + "grad_norm": 3.429368019104004, + "learning_rate": 2.4325506745078524e-06, + "loss": 0.1214, + "step": 830 + }, + { + "epoch": 5.098159509202454, + "grad_norm": 3.4915647506713867, + "learning_rate": 2.427734163114319e-06, + "loss": 0.0454, + "step": 831 + }, + { + "epoch": 5.104294478527607, + "grad_norm": 3.1721251010894775, + "learning_rate": 2.4229179201670954e-06, + "loss": 0.0431, + "step": 832 + }, + { + "epoch": 5.110429447852761, + "grad_norm": 2.552578926086426, + "learning_rate": 2.418101963557107e-06, + "loss": 0.0347, + "step": 833 + }, + { + "epoch": 5.116564417177914, + "grad_norm": 3.518169403076172, + "learning_rate": 2.413286311174214e-06, + "loss": 0.1555, + "step": 834 + }, + { + "epoch": 5.122699386503068, + "grad_norm": 2.4452908039093018, + "learning_rate": 2.4084709809071487e-06, + "loss": 0.035, + "step": 835 + }, + { + "epoch": 5.128834355828221, + "grad_norm": 3.5366528034210205, + "learning_rate": 2.403655990643444e-06, + "loss": 0.0798, + "step": 836 + }, + { + "epoch": 5.134969325153374, + "grad_norm": 2.300065040588379, + "learning_rate": 2.398841358269371e-06, + "loss": 0.0178, + "step": 837 + }, + { + "epoch": 5.141104294478527, + "grad_norm": 2.851393699645996, + "learning_rate": 2.3940271016698733e-06, + "loss": 0.0447, + "step": 838 + }, + { + "epoch": 5.147239263803681, + "grad_norm": 4.085958957672119, + "learning_rate": 2.3892132387284956e-06, + "loss": 0.1626, + "step": 839 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 3.4240522384643555, + "learning_rate": 2.384399787327322e-06, + "loss": 0.0914, + "step": 840 + }, + { + "epoch": 5.159509202453988, + "grad_norm": 4.111586570739746, + "learning_rate": 2.3795867653469072e-06, + "loss": 0.0784, + "step": 841 + }, + { + "epoch": 5.1656441717791415, + "grad_norm": 2.3306312561035156, + "learning_rate": 2.374774190666211e-06, + "loss": 0.0216, + "step": 842 + }, + { + "epoch": 5.171779141104294, + "grad_norm": 2.5006275177001953, + "learning_rate": 2.3699620811625327e-06, + "loss": 0.0516, + "step": 843 + }, + { + "epoch": 5.177914110429448, + "grad_norm": 3.1680967807769775, + "learning_rate": 2.365150454711441e-06, + "loss": 0.0517, + "step": 844 + }, + { + "epoch": 5.184049079754601, + "grad_norm": 1.817044734954834, + "learning_rate": 2.3603393291867122e-06, + "loss": 0.0264, + "step": 845 + }, + { + "epoch": 5.190184049079755, + "grad_norm": 4.445211887359619, + "learning_rate": 2.355528722460261e-06, + "loss": 0.1079, + "step": 846 + }, + { + "epoch": 5.196319018404908, + "grad_norm": 2.918304681777954, + "learning_rate": 2.350718652402076e-06, + "loss": 0.0633, + "step": 847 + }, + { + "epoch": 5.2024539877300615, + "grad_norm": 3.6307432651519775, + "learning_rate": 2.345909136880151e-06, + "loss": 0.1013, + "step": 848 + }, + { + "epoch": 5.208588957055214, + "grad_norm": 3.5696842670440674, + "learning_rate": 2.34110019376042e-06, + "loss": 0.0199, + "step": 849 + }, + { + "epoch": 5.214723926380368, + "grad_norm": 2.2214856147766113, + "learning_rate": 2.336291840906691e-06, + "loss": 0.0288, + "step": 850 + }, + { + "epoch": 5.220858895705521, + "grad_norm": 2.5375778675079346, + "learning_rate": 2.3314840961805806e-06, + "loss": 0.0142, + "step": 851 + }, + { + "epoch": 5.226993865030675, + "grad_norm": 3.0093517303466797, + "learning_rate": 2.326676977441444e-06, + "loss": 0.0911, + "step": 852 + }, + { + "epoch": 5.233128834355828, + "grad_norm": 2.7067151069641113, + "learning_rate": 2.3218705025463118e-06, + "loss": 0.0315, + "step": 853 + }, + { + "epoch": 5.2392638036809815, + "grad_norm": 3.1892940998077393, + "learning_rate": 2.3170646893498237e-06, + "loss": 0.1344, + "step": 854 + }, + { + "epoch": 5.245398773006135, + "grad_norm": 2.8909313678741455, + "learning_rate": 2.312259555704161e-06, + "loss": 0.034, + "step": 855 + }, + { + "epoch": 5.251533742331288, + "grad_norm": 5.097650051116943, + "learning_rate": 2.3074551194589816e-06, + "loss": 0.1889, + "step": 856 + }, + { + "epoch": 5.257668711656442, + "grad_norm": 3.8511006832122803, + "learning_rate": 2.3026513984613506e-06, + "loss": 0.0794, + "step": 857 + }, + { + "epoch": 5.263803680981595, + "grad_norm": 2.2874133586883545, + "learning_rate": 2.297848410555677e-06, + "loss": 0.0238, + "step": 858 + }, + { + "epoch": 5.269938650306749, + "grad_norm": 3.504723310470581, + "learning_rate": 2.293046173583648e-06, + "loss": 0.0369, + "step": 859 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 3.2108154296875, + "learning_rate": 2.28824470538416e-06, + "loss": 0.0677, + "step": 860 + }, + { + "epoch": 5.282208588957055, + "grad_norm": 2.2249386310577393, + "learning_rate": 2.2834440237932537e-06, + "loss": 0.0244, + "step": 861 + }, + { + "epoch": 5.288343558282208, + "grad_norm": 3.141784191131592, + "learning_rate": 2.2786441466440474e-06, + "loss": 0.0628, + "step": 862 + }, + { + "epoch": 5.294478527607362, + "grad_norm": 3.5597352981567383, + "learning_rate": 2.2738450917666727e-06, + "loss": 0.0914, + "step": 863 + }, + { + "epoch": 5.300613496932515, + "grad_norm": 2.991966962814331, + "learning_rate": 2.269046876988204e-06, + "loss": 0.0546, + "step": 864 + }, + { + "epoch": 5.306748466257669, + "grad_norm": 3.100776195526123, + "learning_rate": 2.2642495201325995e-06, + "loss": 0.0473, + "step": 865 + }, + { + "epoch": 5.3128834355828225, + "grad_norm": 2.541754722595215, + "learning_rate": 2.259453039020626e-06, + "loss": 0.0613, + "step": 866 + }, + { + "epoch": 5.319018404907975, + "grad_norm": 2.8117194175720215, + "learning_rate": 2.2546574514697985e-06, + "loss": 0.0533, + "step": 867 + }, + { + "epoch": 5.325153374233129, + "grad_norm": 2.5676379203796387, + "learning_rate": 2.249862775294313e-06, + "loss": 0.018, + "step": 868 + }, + { + "epoch": 5.331288343558282, + "grad_norm": 2.5297701358795166, + "learning_rate": 2.245069028304981e-06, + "loss": 0.0246, + "step": 869 + }, + { + "epoch": 5.337423312883436, + "grad_norm": 2.199498176574707, + "learning_rate": 2.240276228309161e-06, + "loss": 0.0551, + "step": 870 + }, + { + "epoch": 5.343558282208589, + "grad_norm": 2.5793557167053223, + "learning_rate": 2.2354843931106933e-06, + "loss": 0.0258, + "step": 871 + }, + { + "epoch": 5.3496932515337425, + "grad_norm": 3.352058172225952, + "learning_rate": 2.230693540509836e-06, + "loss": 0.0228, + "step": 872 + }, + { + "epoch": 5.355828220858895, + "grad_norm": 2.900599956512451, + "learning_rate": 2.225903688303195e-06, + "loss": 0.0586, + "step": 873 + }, + { + "epoch": 5.361963190184049, + "grad_norm": 3.3317267894744873, + "learning_rate": 2.221114854283662e-06, + "loss": 0.0733, + "step": 874 + }, + { + "epoch": 5.368098159509202, + "grad_norm": 2.79304575920105, + "learning_rate": 2.2163270562403453e-06, + "loss": 0.0251, + "step": 875 + }, + { + "epoch": 5.374233128834356, + "grad_norm": 3.8596227169036865, + "learning_rate": 2.211540311958506e-06, + "loss": 0.0957, + "step": 876 + }, + { + "epoch": 5.38036809815951, + "grad_norm": 2.7464358806610107, + "learning_rate": 2.2067546392194888e-06, + "loss": 0.0457, + "step": 877 + }, + { + "epoch": 5.386503067484663, + "grad_norm": 2.3359906673431396, + "learning_rate": 2.2019700558006598e-06, + "loss": 0.0218, + "step": 878 + }, + { + "epoch": 5.392638036809816, + "grad_norm": 3.2412452697753906, + "learning_rate": 2.197186579475337e-06, + "loss": 0.0494, + "step": 879 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 3.930197238922119, + "learning_rate": 2.1924042280127284e-06, + "loss": 0.0803, + "step": 880 + }, + { + "epoch": 5.404907975460123, + "grad_norm": 2.5752930641174316, + "learning_rate": 2.1876230191778598e-06, + "loss": 0.0356, + "step": 881 + }, + { + "epoch": 5.411042944785276, + "grad_norm": 5.507393836975098, + "learning_rate": 2.182842970731516e-06, + "loss": 0.1245, + "step": 882 + }, + { + "epoch": 5.41717791411043, + "grad_norm": 2.416719436645508, + "learning_rate": 2.17806410043017e-06, + "loss": 0.0224, + "step": 883 + }, + { + "epoch": 5.423312883435583, + "grad_norm": 2.500429630279541, + "learning_rate": 2.173286426025917e-06, + "loss": 0.0499, + "step": 884 + }, + { + "epoch": 5.429447852760736, + "grad_norm": 2.8843860626220703, + "learning_rate": 2.168509965266411e-06, + "loss": 0.075, + "step": 885 + }, + { + "epoch": 5.435582822085889, + "grad_norm": 2.3187198638916016, + "learning_rate": 2.1637347358947984e-06, + "loss": 0.065, + "step": 886 + }, + { + "epoch": 5.441717791411043, + "grad_norm": 2.7135889530181885, + "learning_rate": 2.15896075564965e-06, + "loss": 0.0848, + "step": 887 + }, + { + "epoch": 5.447852760736196, + "grad_norm": 1.751846194267273, + "learning_rate": 2.1541880422648978e-06, + "loss": 0.0112, + "step": 888 + }, + { + "epoch": 5.45398773006135, + "grad_norm": 3.113271713256836, + "learning_rate": 2.1494166134697655e-06, + "loss": 0.077, + "step": 889 + }, + { + "epoch": 5.460122699386503, + "grad_norm": 2.711318016052246, + "learning_rate": 2.1446464869887077e-06, + "loss": 0.03, + "step": 890 + }, + { + "epoch": 5.466257668711656, + "grad_norm": 1.8012003898620605, + "learning_rate": 2.13987768054134e-06, + "loss": 0.0141, + "step": 891 + }, + { + "epoch": 5.47239263803681, + "grad_norm": 2.0968120098114014, + "learning_rate": 2.135110211842374e-06, + "loss": 0.0147, + "step": 892 + }, + { + "epoch": 5.478527607361963, + "grad_norm": 3.1689956188201904, + "learning_rate": 2.1303440986015525e-06, + "loss": 0.1123, + "step": 893 + }, + { + "epoch": 5.484662576687117, + "grad_norm": 4.512697219848633, + "learning_rate": 2.1255793585235827e-06, + "loss": 0.0359, + "step": 894 + }, + { + "epoch": 5.49079754601227, + "grad_norm": 3.5739688873291016, + "learning_rate": 2.120816009308071e-06, + "loss": 0.0635, + "step": 895 + }, + { + "epoch": 5.4969325153374236, + "grad_norm": 4.556554317474365, + "learning_rate": 2.1160540686494597e-06, + "loss": 0.1104, + "step": 896 + }, + { + "epoch": 5.5030674846625764, + "grad_norm": 2.2047064304351807, + "learning_rate": 2.1112935542369546e-06, + "loss": 0.0187, + "step": 897 + }, + { + "epoch": 5.50920245398773, + "grad_norm": 3.0289857387542725, + "learning_rate": 2.106534483754466e-06, + "loss": 0.0874, + "step": 898 + }, + { + "epoch": 5.515337423312883, + "grad_norm": 2.7090444564819336, + "learning_rate": 2.1017768748805396e-06, + "loss": 0.0301, + "step": 899 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 3.0662643909454346, + "learning_rate": 2.0970207452882917e-06, + "loss": 0.1192, + "step": 900 + }, + { + "epoch": 5.52760736196319, + "grad_norm": 2.869401454925537, + "learning_rate": 2.0922661126453436e-06, + "loss": 0.0803, + "step": 901 + }, + { + "epoch": 5.533742331288344, + "grad_norm": 2.229947328567505, + "learning_rate": 2.0875129946137557e-06, + "loss": 0.0186, + "step": 902 + }, + { + "epoch": 5.539877300613497, + "grad_norm": 3.3460421562194824, + "learning_rate": 2.0827614088499624e-06, + "loss": 0.0499, + "step": 903 + }, + { + "epoch": 5.54601226993865, + "grad_norm": 1.9324007034301758, + "learning_rate": 2.0780113730047056e-06, + "loss": 0.0322, + "step": 904 + }, + { + "epoch": 5.552147239263804, + "grad_norm": 2.761482000350952, + "learning_rate": 2.0732629047229712e-06, + "loss": 0.0265, + "step": 905 + }, + { + "epoch": 5.558282208588957, + "grad_norm": 2.4173266887664795, + "learning_rate": 2.0685160216439205e-06, + "loss": 0.0229, + "step": 906 + }, + { + "epoch": 5.564417177914111, + "grad_norm": 2.503661632537842, + "learning_rate": 2.0637707414008267e-06, + "loss": 0.0266, + "step": 907 + }, + { + "epoch": 5.570552147239264, + "grad_norm": 2.312236785888672, + "learning_rate": 2.0590270816210077e-06, + "loss": 0.018, + "step": 908 + }, + { + "epoch": 5.576687116564417, + "grad_norm": 2.569575548171997, + "learning_rate": 2.0542850599257647e-06, + "loss": 0.0377, + "step": 909 + }, + { + "epoch": 5.58282208588957, + "grad_norm": 3.520341157913208, + "learning_rate": 2.0495446939303122e-06, + "loss": 0.1224, + "step": 910 + }, + { + "epoch": 5.588957055214724, + "grad_norm": 3.231363296508789, + "learning_rate": 2.044806001243714e-06, + "loss": 0.1457, + "step": 911 + }, + { + "epoch": 5.595092024539877, + "grad_norm": 3.3211300373077393, + "learning_rate": 2.040068999468818e-06, + "loss": 0.0429, + "step": 912 + }, + { + "epoch": 5.601226993865031, + "grad_norm": 3.3712961673736572, + "learning_rate": 2.035333706202192e-06, + "loss": 0.0634, + "step": 913 + }, + { + "epoch": 5.6073619631901845, + "grad_norm": 2.480177402496338, + "learning_rate": 2.0306001390340565e-06, + "loss": 0.0178, + "step": 914 + }, + { + "epoch": 5.613496932515337, + "grad_norm": 2.9777421951293945, + "learning_rate": 2.02586831554822e-06, + "loss": 0.037, + "step": 915 + }, + { + "epoch": 5.61963190184049, + "grad_norm": 2.9129085540771484, + "learning_rate": 2.021138253322012e-06, + "loss": 0.125, + "step": 916 + }, + { + "epoch": 5.625766871165644, + "grad_norm": 4.041767597198486, + "learning_rate": 2.016409969926224e-06, + "loss": 0.1897, + "step": 917 + }, + { + "epoch": 5.631901840490798, + "grad_norm": 4.088902950286865, + "learning_rate": 2.0116834829250355e-06, + "loss": 0.0546, + "step": 918 + }, + { + "epoch": 5.638036809815951, + "grad_norm": 3.8629167079925537, + "learning_rate": 2.0069588098759545e-06, + "loss": 0.0911, + "step": 919 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 2.616830825805664, + "learning_rate": 2.00223596832975e-06, + "loss": 0.0527, + "step": 920 + }, + { + "epoch": 5.6503067484662575, + "grad_norm": 1.9370782375335693, + "learning_rate": 1.9975149758303885e-06, + "loss": 0.0384, + "step": 921 + }, + { + "epoch": 5.656441717791411, + "grad_norm": 3.7839455604553223, + "learning_rate": 1.992795849914967e-06, + "loss": 0.1033, + "step": 922 + }, + { + "epoch": 5.662576687116564, + "grad_norm": 3.870729923248291, + "learning_rate": 1.9880786081136498e-06, + "loss": 0.08, + "step": 923 + }, + { + "epoch": 5.668711656441718, + "grad_norm": 3.4394288063049316, + "learning_rate": 1.9833632679496008e-06, + "loss": 0.0819, + "step": 924 + }, + { + "epoch": 5.674846625766871, + "grad_norm": 3.1659159660339355, + "learning_rate": 1.97864984693892e-06, + "loss": 0.117, + "step": 925 + }, + { + "epoch": 5.680981595092025, + "grad_norm": 2.2375190258026123, + "learning_rate": 1.97393836259058e-06, + "loss": 0.0215, + "step": 926 + }, + { + "epoch": 5.6871165644171775, + "grad_norm": 3.9375314712524414, + "learning_rate": 1.969228832406358e-06, + "loss": 0.1422, + "step": 927 + }, + { + "epoch": 5.693251533742331, + "grad_norm": 3.1969058513641357, + "learning_rate": 1.964521273880772e-06, + "loss": 0.0538, + "step": 928 + }, + { + "epoch": 5.699386503067485, + "grad_norm": 3.5990066528320312, + "learning_rate": 1.9598157045010162e-06, + "loss": 0.114, + "step": 929 + }, + { + "epoch": 5.705521472392638, + "grad_norm": 3.1764235496520996, + "learning_rate": 1.9551121417468955e-06, + "loss": 0.053, + "step": 930 + }, + { + "epoch": 5.711656441717792, + "grad_norm": 4.1162309646606445, + "learning_rate": 1.9504106030907605e-06, + "loss": 0.0866, + "step": 931 + }, + { + "epoch": 5.717791411042945, + "grad_norm": 3.543071985244751, + "learning_rate": 1.945711105997444e-06, + "loss": 0.0908, + "step": 932 + }, + { + "epoch": 5.723926380368098, + "grad_norm": 4.136870384216309, + "learning_rate": 1.941013667924194e-06, + "loss": 0.0612, + "step": 933 + }, + { + "epoch": 5.730061349693251, + "grad_norm": 1.7658357620239258, + "learning_rate": 1.9363183063206097e-06, + "loss": 0.0283, + "step": 934 + }, + { + "epoch": 5.736196319018405, + "grad_norm": 3.9701411724090576, + "learning_rate": 1.931625038628577e-06, + "loss": 0.0948, + "step": 935 + }, + { + "epoch": 5.742331288343558, + "grad_norm": 3.0636157989501953, + "learning_rate": 1.9269338822822047e-06, + "loss": 0.0769, + "step": 936 + }, + { + "epoch": 5.748466257668712, + "grad_norm": 3.3671388626098633, + "learning_rate": 1.9222448547077573e-06, + "loss": 0.098, + "step": 937 + }, + { + "epoch": 5.754601226993865, + "grad_norm": 3.0725975036621094, + "learning_rate": 1.917557973323591e-06, + "loss": 0.0363, + "step": 938 + }, + { + "epoch": 5.7607361963190185, + "grad_norm": 2.5592041015625, + "learning_rate": 1.9128732555400915e-06, + "loss": 0.0205, + "step": 939 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 2.835740804672241, + "learning_rate": 1.9081907187596054e-06, + "loss": 0.0548, + "step": 940 + }, + { + "epoch": 5.773006134969325, + "grad_norm": 3.3596746921539307, + "learning_rate": 1.9035103803763793e-06, + "loss": 0.0454, + "step": 941 + }, + { + "epoch": 5.779141104294479, + "grad_norm": 3.226579427719116, + "learning_rate": 1.8988322577764918e-06, + "loss": 0.0514, + "step": 942 + }, + { + "epoch": 5.785276073619632, + "grad_norm": 3.2044687271118164, + "learning_rate": 1.8941563683377905e-06, + "loss": 0.1361, + "step": 943 + }, + { + "epoch": 5.791411042944786, + "grad_norm": 1.8300527334213257, + "learning_rate": 1.8894827294298296e-06, + "loss": 0.0139, + "step": 944 + }, + { + "epoch": 5.7975460122699385, + "grad_norm": 2.503735303878784, + "learning_rate": 1.884811358413801e-06, + "loss": 0.0311, + "step": 945 + }, + { + "epoch": 5.803680981595092, + "grad_norm": 2.171309471130371, + "learning_rate": 1.8801422726424735e-06, + "loss": 0.0227, + "step": 946 + }, + { + "epoch": 5.809815950920245, + "grad_norm": 1.8116636276245117, + "learning_rate": 1.8754754894601252e-06, + "loss": 0.0157, + "step": 947 + }, + { + "epoch": 5.815950920245399, + "grad_norm": 3.1412570476531982, + "learning_rate": 1.870811026202482e-06, + "loss": 0.1093, + "step": 948 + }, + { + "epoch": 5.822085889570552, + "grad_norm": 2.3962290287017822, + "learning_rate": 1.8661489001966526e-06, + "loss": 0.021, + "step": 949 + }, + { + "epoch": 5.828220858895706, + "grad_norm": 4.169166564941406, + "learning_rate": 1.8614891287610621e-06, + "loss": 0.0663, + "step": 950 + }, + { + "epoch": 5.8343558282208585, + "grad_norm": 3.1181528568267822, + "learning_rate": 1.8568317292053894e-06, + "loss": 0.1008, + "step": 951 + }, + { + "epoch": 5.840490797546012, + "grad_norm": 3.5155029296875, + "learning_rate": 1.8521767188305023e-06, + "loss": 0.0451, + "step": 952 + }, + { + "epoch": 5.846625766871165, + "grad_norm": 2.975693702697754, + "learning_rate": 1.8475241149283957e-06, + "loss": 0.0561, + "step": 953 + }, + { + "epoch": 5.852760736196319, + "grad_norm": 2.1581289768218994, + "learning_rate": 1.842873934782122e-06, + "loss": 0.0265, + "step": 954 + }, + { + "epoch": 5.858895705521473, + "grad_norm": 2.6281228065490723, + "learning_rate": 1.8382261956657318e-06, + "loss": 0.1196, + "step": 955 + }, + { + "epoch": 5.865030674846626, + "grad_norm": 2.9569528102874756, + "learning_rate": 1.8335809148442074e-06, + "loss": 0.1356, + "step": 956 + }, + { + "epoch": 5.871165644171779, + "grad_norm": 2.450949192047119, + "learning_rate": 1.8289381095734005e-06, + "loss": 0.0444, + "step": 957 + }, + { + "epoch": 5.877300613496932, + "grad_norm": 2.1737027168273926, + "learning_rate": 1.8242977970999643e-06, + "loss": 0.0622, + "step": 958 + }, + { + "epoch": 5.883435582822086, + "grad_norm": 3.350647211074829, + "learning_rate": 1.8196599946612956e-06, + "loss": 0.0762, + "step": 959 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 2.5031936168670654, + "learning_rate": 1.8150247194854642e-06, + "loss": 0.0207, + "step": 960 + }, + { + "epoch": 5.895705521472393, + "grad_norm": 3.7103707790374756, + "learning_rate": 1.8103919887911525e-06, + "loss": 0.1122, + "step": 961 + }, + { + "epoch": 5.901840490797546, + "grad_norm": 2.485322952270508, + "learning_rate": 1.8057618197875914e-06, + "loss": 0.0284, + "step": 962 + }, + { + "epoch": 5.9079754601226995, + "grad_norm": 1.903212547302246, + "learning_rate": 1.8011342296744961e-06, + "loss": 0.0239, + "step": 963 + }, + { + "epoch": 5.914110429447852, + "grad_norm": 3.015552520751953, + "learning_rate": 1.796509235642001e-06, + "loss": 0.0425, + "step": 964 + }, + { + "epoch": 5.920245398773006, + "grad_norm": 4.806198596954346, + "learning_rate": 1.7918868548705982e-06, + "loss": 0.2094, + "step": 965 + }, + { + "epoch": 5.92638036809816, + "grad_norm": 2.949596643447876, + "learning_rate": 1.7872671045310703e-06, + "loss": 0.0632, + "step": 966 + }, + { + "epoch": 5.932515337423313, + "grad_norm": 4.153099536895752, + "learning_rate": 1.782650001784431e-06, + "loss": 0.1411, + "step": 967 + }, + { + "epoch": 5.938650306748467, + "grad_norm": 3.4117565155029297, + "learning_rate": 1.7780355637818568e-06, + "loss": 0.0965, + "step": 968 + }, + { + "epoch": 5.9447852760736195, + "grad_norm": 2.533405303955078, + "learning_rate": 1.7734238076646277e-06, + "loss": 0.0568, + "step": 969 + }, + { + "epoch": 5.950920245398773, + "grad_norm": 2.3604726791381836, + "learning_rate": 1.7688147505640581e-06, + "loss": 0.0182, + "step": 970 + }, + { + "epoch": 5.957055214723926, + "grad_norm": 3.807424306869507, + "learning_rate": 1.7642084096014405e-06, + "loss": 0.0547, + "step": 971 + }, + { + "epoch": 5.96319018404908, + "grad_norm": 2.5735342502593994, + "learning_rate": 1.759604801887974e-06, + "loss": 0.0775, + "step": 972 + }, + { + "epoch": 5.969325153374233, + "grad_norm": 2.9217734336853027, + "learning_rate": 1.7550039445247069e-06, + "loss": 0.0541, + "step": 973 + }, + { + "epoch": 5.975460122699387, + "grad_norm": 2.793104410171509, + "learning_rate": 1.7504058546024694e-06, + "loss": 0.0257, + "step": 974 + }, + { + "epoch": 5.9815950920245395, + "grad_norm": 3.5610134601593018, + "learning_rate": 1.7458105492018114e-06, + "loss": 0.0767, + "step": 975 + }, + { + "epoch": 5.987730061349693, + "grad_norm": 2.0738015174865723, + "learning_rate": 1.7412180453929412e-06, + "loss": 0.025, + "step": 976 + }, + { + "epoch": 5.993865030674847, + "grad_norm": 2.1248421669006348, + "learning_rate": 1.736628360235657e-06, + "loss": 0.0183, + "step": 977 + }, + { + "epoch": 6.0, + "grad_norm": 2.901273727416992, + "learning_rate": 1.7320415107792893e-06, + "loss": 0.1369, + "step": 978 + } + ], + "logging_steps": 1, + "max_steps": 1630, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4215941729694515e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ee5ec43a657370cb1c978cde27484c565f4a94d6 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json @@ -0,0 +1,7804 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 1110, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005405405405405406, + "grad_norm": 72.60939025878906, + "learning_rate": 5e-06, + "loss": 2.9165, + "step": 1 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 29.01830291748047, + "learning_rate": 4.999996395324314e-06, + "loss": 1.9314, + "step": 2 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 21.44908332824707, + "learning_rate": 4.99998558130765e-06, + "loss": 1.5709, + "step": 3 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 4.490907669067383, + "learning_rate": 4.999967557981192e-06, + "loss": 0.8099, + "step": 4 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 4.000796794891357, + "learning_rate": 4.999942325396917e-06, + "loss": 0.9021, + "step": 5 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 18.513282775878906, + "learning_rate": 4.999909883627588e-06, + "loss": 1.7972, + "step": 6 + }, + { + "epoch": 0.03783783783783784, + "grad_norm": 3.5735981464385986, + "learning_rate": 4.999870232766757e-06, + "loss": 1.4306, + "step": 7 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 3.1145193576812744, + "learning_rate": 4.9998233729287696e-06, + "loss": 1.051, + "step": 8 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 3.856376886367798, + "learning_rate": 4.999769304248755e-06, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 4.05589485168457, + "learning_rate": 4.9997080268826344e-06, + "loss": 1.0999, + "step": 10 + }, + { + "epoch": 0.05945945945945946, + "grad_norm": 13.784229278564453, + "learning_rate": 4.9996395410071165e-06, + "loss": 1.2831, + "step": 11 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 6.079237937927246, + "learning_rate": 4.999563846819696e-06, + "loss": 1.2874, + "step": 12 + }, + { + "epoch": 0.07027027027027027, + "grad_norm": 4.5971245765686035, + "learning_rate": 4.999480944538655e-06, + "loss": 0.96, + "step": 13 + }, + { + "epoch": 0.07567567567567568, + "grad_norm": 4.916017532348633, + "learning_rate": 4.999390834403063e-06, + "loss": 0.9869, + "step": 14 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 3.2311055660247803, + "learning_rate": 4.999293516672773e-06, + "loss": 0.9293, + "step": 15 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 3.3040921688079834, + "learning_rate": 4.9991889916284255e-06, + "loss": 0.8914, + "step": 16 + }, + { + "epoch": 0.0918918918918919, + "grad_norm": 3.794267416000366, + "learning_rate": 4.999077259571442e-06, + "loss": 1.0176, + "step": 17 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 4.788509845733643, + "learning_rate": 4.998958320824031e-06, + "loss": 1.0259, + "step": 18 + }, + { + "epoch": 0.10270270270270271, + "grad_norm": 10.027527809143066, + "learning_rate": 4.998832175729179e-06, + "loss": 1.3356, + "step": 19 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 4.612483978271484, + "learning_rate": 4.998698824650656e-06, + "loss": 1.4486, + "step": 20 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 3.8676936626434326, + "learning_rate": 4.998558267973014e-06, + "loss": 0.8372, + "step": 21 + }, + { + "epoch": 0.11891891891891893, + "grad_norm": 2.9611001014709473, + "learning_rate": 4.998410506101579e-06, + "loss": 0.7931, + "step": 22 + }, + { + "epoch": 0.12432432432432433, + "grad_norm": 5.508745193481445, + "learning_rate": 4.9982555394624595e-06, + "loss": 1.3022, + "step": 23 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 3.434845209121704, + "learning_rate": 4.998093368502539e-06, + "loss": 0.9739, + "step": 24 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 4.736802101135254, + "learning_rate": 4.9979239936894765e-06, + "loss": 1.1154, + "step": 25 + }, + { + "epoch": 0.14054054054054055, + "grad_norm": 3.69411039352417, + "learning_rate": 4.997747415511705e-06, + "loss": 0.7543, + "step": 26 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 2.8646645545959473, + "learning_rate": 4.997563634478428e-06, + "loss": 0.7278, + "step": 27 + }, + { + "epoch": 0.15135135135135136, + "grad_norm": 6.56904935836792, + "learning_rate": 4.997372651119626e-06, + "loss": 0.8167, + "step": 28 + }, + { + "epoch": 0.15675675675675677, + "grad_norm": 2.955914258956909, + "learning_rate": 4.997174465986044e-06, + "loss": 0.8031, + "step": 29 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 2.5714259147644043, + "learning_rate": 4.996969079649196e-06, + "loss": 0.689, + "step": 30 + }, + { + "epoch": 0.16756756756756758, + "grad_norm": 3.5165364742279053, + "learning_rate": 4.996756492701362e-06, + "loss": 0.8059, + "step": 31 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 3.2861921787261963, + "learning_rate": 4.996536705755591e-06, + "loss": 0.9658, + "step": 32 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 2.962470531463623, + "learning_rate": 4.996309719445687e-06, + "loss": 0.8349, + "step": 33 + }, + { + "epoch": 0.1837837837837838, + "grad_norm": 2.7694804668426514, + "learning_rate": 4.996075534426223e-06, + "loss": 0.8287, + "step": 34 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 3.405071258544922, + "learning_rate": 4.995834151372526e-06, + "loss": 1.1211, + "step": 35 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 2.8680710792541504, + "learning_rate": 4.995585570980685e-06, + "loss": 1.0841, + "step": 36 + }, + { + "epoch": 0.2, + "grad_norm": 3.341021776199341, + "learning_rate": 4.995329793967537e-06, + "loss": 0.6182, + "step": 37 + }, + { + "epoch": 0.20540540540540542, + "grad_norm": 3.0639379024505615, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.7647, + "step": 38 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 3.225759983062744, + "learning_rate": 4.994796653048457e-06, + "loss": 0.8691, + "step": 39 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 4.56926155090332, + "learning_rate": 4.994519290679965e-06, + "loss": 1.0404, + "step": 40 + }, + { + "epoch": 0.22162162162162163, + "grad_norm": 4.871571063995361, + "learning_rate": 4.994234734765043e-06, + "loss": 1.1877, + "step": 41 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 3.672215700149536, + "learning_rate": 4.993942986124278e-06, + "loss": 0.959, + "step": 42 + }, + { + "epoch": 0.23243243243243245, + "grad_norm": 3.184683322906494, + "learning_rate": 4.9936440455989975e-06, + "loss": 0.9249, + "step": 43 + }, + { + "epoch": 0.23783783783783785, + "grad_norm": 2.7092034816741943, + "learning_rate": 4.993337914051266e-06, + "loss": 0.6899, + "step": 44 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 3.153764486312866, + "learning_rate": 4.99302459236389e-06, + "loss": 0.9075, + "step": 45 + }, + { + "epoch": 0.24864864864864866, + "grad_norm": 3.3629748821258545, + "learning_rate": 4.992704081440407e-06, + "loss": 0.785, + "step": 46 + }, + { + "epoch": 0.25405405405405407, + "grad_norm": 4.478365898132324, + "learning_rate": 4.992376382205088e-06, + "loss": 1.008, + "step": 47 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 3.4001641273498535, + "learning_rate": 4.992041495602932e-06, + "loss": 0.7751, + "step": 48 + }, + { + "epoch": 0.2648648648648649, + "grad_norm": 2.522662878036499, + "learning_rate": 4.991699422599664e-06, + "loss": 0.9022, + "step": 49 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.764458179473877, + "learning_rate": 4.991350164181735e-06, + "loss": 0.8801, + "step": 50 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 2.814859628677368, + "learning_rate": 4.990993721356317e-06, + "loss": 0.7045, + "step": 51 + }, + { + "epoch": 0.2810810810810811, + "grad_norm": 2.441311836242676, + "learning_rate": 4.990630095151296e-06, + "loss": 0.7312, + "step": 52 + }, + { + "epoch": 0.2864864864864865, + "grad_norm": 2.4443013668060303, + "learning_rate": 4.9902592866152765e-06, + "loss": 0.9609, + "step": 53 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 2.2934701442718506, + "learning_rate": 4.989881296817575e-06, + "loss": 0.5753, + "step": 54 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 2.6286847591400146, + "learning_rate": 4.989496126848215e-06, + "loss": 0.5118, + "step": 55 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 3.6817069053649902, + "learning_rate": 4.989103777817928e-06, + "loss": 1.1261, + "step": 56 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 3.011197566986084, + "learning_rate": 4.988704250858145e-06, + "loss": 0.7823, + "step": 57 + }, + { + "epoch": 0.31351351351351353, + "grad_norm": 2.5490806102752686, + "learning_rate": 4.988297547121e-06, + "loss": 0.6019, + "step": 58 + }, + { + "epoch": 0.31891891891891894, + "grad_norm": 3.0803146362304688, + "learning_rate": 4.98788366777932e-06, + "loss": 0.825, + "step": 59 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 3.015730619430542, + "learning_rate": 4.987462614026625e-06, + "loss": 0.7667, + "step": 60 + }, + { + "epoch": 0.32972972972972975, + "grad_norm": 2.5371594429016113, + "learning_rate": 4.987034387077126e-06, + "loss": 0.8051, + "step": 61 + }, + { + "epoch": 0.33513513513513515, + "grad_norm": 2.6414010524749756, + "learning_rate": 4.986598988165718e-06, + "loss": 0.6895, + "step": 62 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 3.065131187438965, + "learning_rate": 4.9861564185479785e-06, + "loss": 0.9268, + "step": 63 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 2.5708694458007812, + "learning_rate": 4.985706679500163e-06, + "loss": 0.9854, + "step": 64 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 2.768915891647339, + "learning_rate": 4.9852497723192025e-06, + "loss": 0.8083, + "step": 65 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 2.567901849746704, + "learning_rate": 4.9847856983227e-06, + "loss": 0.9098, + "step": 66 + }, + { + "epoch": 0.3621621621621622, + "grad_norm": 2.5766549110412598, + "learning_rate": 4.984314458848923e-06, + "loss": 0.8881, + "step": 67 + }, + { + "epoch": 0.3675675675675676, + "grad_norm": 2.9778389930725098, + "learning_rate": 4.983836055256804e-06, + "loss": 0.9877, + "step": 68 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 2.7225165367126465, + "learning_rate": 4.983350488925935e-06, + "loss": 0.8282, + "step": 69 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 2.702287197113037, + "learning_rate": 4.982857761256564e-06, + "loss": 1.1756, + "step": 70 + }, + { + "epoch": 0.3837837837837838, + "grad_norm": 2.9815568923950195, + "learning_rate": 4.982357873669589e-06, + "loss": 0.8114, + "step": 71 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 3.27150297164917, + "learning_rate": 4.981850827606556e-06, + "loss": 0.6763, + "step": 72 + }, + { + "epoch": 0.3945945945945946, + "grad_norm": 2.568423271179199, + "learning_rate": 4.981336624529655e-06, + "loss": 0.9372, + "step": 73 + }, + { + "epoch": 0.4, + "grad_norm": 2.621175527572632, + "learning_rate": 4.980815265921714e-06, + "loss": 1.0155, + "step": 74 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 2.62827205657959, + "learning_rate": 4.980286753286196e-06, + "loss": 0.949, + "step": 75 + }, + { + "epoch": 0.41081081081081083, + "grad_norm": 2.9462146759033203, + "learning_rate": 4.979751088147192e-06, + "loss": 1.0134, + "step": 76 + }, + { + "epoch": 0.41621621621621624, + "grad_norm": 2.814852714538574, + "learning_rate": 4.979208272049425e-06, + "loss": 0.9722, + "step": 77 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 4.177679538726807, + "learning_rate": 4.978658306558235e-06, + "loss": 1.2259, + "step": 78 + }, + { + "epoch": 0.42702702702702705, + "grad_norm": 2.813084125518799, + "learning_rate": 4.978101193259578e-06, + "loss": 0.834, + "step": 79 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.71824049949646, + "learning_rate": 4.977536933760025e-06, + "loss": 0.6151, + "step": 80 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 4.992153167724609, + "learning_rate": 4.976965529686755e-06, + "loss": 1.0475, + "step": 81 + }, + { + "epoch": 0.44324324324324327, + "grad_norm": 2.4810822010040283, + "learning_rate": 4.976386982687548e-06, + "loss": 0.8324, + "step": 82 + }, + { + "epoch": 0.4486486486486487, + "grad_norm": 4.509149074554443, + "learning_rate": 4.9758012944307845e-06, + "loss": 0.997, + "step": 83 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 3.114325761795044, + "learning_rate": 4.975208466605436e-06, + "loss": 1.2024, + "step": 84 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 3.297091007232666, + "learning_rate": 4.974608500921064e-06, + "loss": 0.9146, + "step": 85 + }, + { + "epoch": 0.4648648648648649, + "grad_norm": 2.824475049972534, + "learning_rate": 4.974001399107816e-06, + "loss": 0.7181, + "step": 86 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 20.262290954589844, + "learning_rate": 4.973387162916415e-06, + "loss": 0.8599, + "step": 87 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 4.015744686126709, + "learning_rate": 4.972765794118158e-06, + "loss": 0.6081, + "step": 88 + }, + { + "epoch": 0.4810810810810811, + "grad_norm": 2.8033058643341064, + "learning_rate": 4.9721372945049114e-06, + "loss": 0.8764, + "step": 89 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 5.271846294403076, + "learning_rate": 4.971501665889107e-06, + "loss": 0.8622, + "step": 90 + }, + { + "epoch": 0.4918918918918919, + "grad_norm": 2.557264804840088, + "learning_rate": 4.9708589101037306e-06, + "loss": 0.5523, + "step": 91 + }, + { + "epoch": 0.4972972972972973, + "grad_norm": 4.342173099517822, + "learning_rate": 4.970209029002325e-06, + "loss": 0.8922, + "step": 92 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 2.950364351272583, + "learning_rate": 4.969552024458977e-06, + "loss": 0.9455, + "step": 93 + }, + { + "epoch": 0.5081081081081081, + "grad_norm": 2.6453042030334473, + "learning_rate": 4.968887898368318e-06, + "loss": 0.8342, + "step": 94 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 3.486766815185547, + "learning_rate": 4.968216652645515e-06, + "loss": 0.8476, + "step": 95 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 2.884152889251709, + "learning_rate": 4.967538289226268e-06, + "loss": 0.8879, + "step": 96 + }, + { + "epoch": 0.5243243243243243, + "grad_norm": 2.4130594730377197, + "learning_rate": 4.966852810066798e-06, + "loss": 0.7114, + "step": 97 + }, + { + "epoch": 0.5297297297297298, + "grad_norm": 3.182410955429077, + "learning_rate": 4.9661602171438524e-06, + "loss": 0.6757, + "step": 98 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 2.5027542114257812, + "learning_rate": 4.965460512454687e-06, + "loss": 0.8029, + "step": 99 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.3096024990081787, + "learning_rate": 4.964753698017071e-06, + "loss": 0.842, + "step": 100 + }, + { + "epoch": 0.5459459459459459, + "grad_norm": 2.875657081604004, + "learning_rate": 4.964039775869271e-06, + "loss": 0.6339, + "step": 101 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 2.505406141281128, + "learning_rate": 4.963318748070056e-06, + "loss": 0.7743, + "step": 102 + }, + { + "epoch": 0.5567567567567567, + "grad_norm": 3.552562713623047, + "learning_rate": 4.9625906166986815e-06, + "loss": 0.926, + "step": 103 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 2.717942476272583, + "learning_rate": 4.961855383854889e-06, + "loss": 0.7037, + "step": 104 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 2.5049386024475098, + "learning_rate": 4.961113051658901e-06, + "loss": 0.561, + "step": 105 + }, + { + "epoch": 0.572972972972973, + "grad_norm": 2.3112900257110596, + "learning_rate": 4.96036362225141e-06, + "loss": 0.7316, + "step": 106 + }, + { + "epoch": 0.5783783783783784, + "grad_norm": 2.470257520675659, + "learning_rate": 4.959607097793575e-06, + "loss": 0.6426, + "step": 107 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 3.8040788173675537, + "learning_rate": 4.9588434804670176e-06, + "loss": 1.0044, + "step": 108 + }, + { + "epoch": 0.5891891891891892, + "grad_norm": 3.143547296524048, + "learning_rate": 4.958072772473812e-06, + "loss": 0.9219, + "step": 109 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 3.5052590370178223, + "learning_rate": 4.9572949760364795e-06, + "loss": 0.6056, + "step": 110 + }, + { + "epoch": 0.6, + "grad_norm": 3.064009428024292, + "learning_rate": 4.9565100933979835e-06, + "loss": 0.6346, + "step": 111 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 2.694610595703125, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.9856, + "step": 112 + }, + { + "epoch": 0.6108108108108108, + "grad_norm": 2.5885775089263916, + "learning_rate": 4.954919078591521e-06, + "loss": 0.8669, + "step": 113 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 2.593609571456909, + "learning_rate": 4.954112951011628e-06, + "loss": 0.7201, + "step": 114 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 3.3045759201049805, + "learning_rate": 4.9532997464067065e-06, + "loss": 0.9095, + "step": 115 + }, + { + "epoch": 0.6270270270270271, + "grad_norm": 2.8144869804382324, + "learning_rate": 4.952479467121828e-06, + "loss": 1.0213, + "step": 116 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 2.5460312366485596, + "learning_rate": 4.951652115522463e-06, + "loss": 1.1154, + "step": 117 + }, + { + "epoch": 0.6378378378378379, + "grad_norm": 2.795137405395508, + "learning_rate": 4.950817693994481e-06, + "loss": 0.691, + "step": 118 + }, + { + "epoch": 0.6432432432432432, + "grad_norm": 2.4979195594787598, + "learning_rate": 4.949976204944135e-06, + "loss": 0.7224, + "step": 119 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 3.3131983280181885, + "learning_rate": 4.949127650798063e-06, + "loss": 0.9256, + "step": 120 + }, + { + "epoch": 0.654054054054054, + "grad_norm": 2.9060285091400146, + "learning_rate": 4.948272034003275e-06, + "loss": 0.6892, + "step": 121 + }, + { + "epoch": 0.6594594594594595, + "grad_norm": 3.695594549179077, + "learning_rate": 4.947409357027148e-06, + "loss": 0.5878, + "step": 122 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 3.1250460147857666, + "learning_rate": 4.9465396223574165e-06, + "loss": 0.9904, + "step": 123 + }, + { + "epoch": 0.6702702702702703, + "grad_norm": 4.024891376495361, + "learning_rate": 4.945662832502172e-06, + "loss": 1.1592, + "step": 124 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 2.6886494159698486, + "learning_rate": 4.944778989989847e-06, + "loss": 1.0041, + "step": 125 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 2.366912841796875, + "learning_rate": 4.943888097369216e-06, + "loss": 0.7045, + "step": 126 + }, + { + "epoch": 0.6864864864864865, + "grad_norm": 2.394932270050049, + "learning_rate": 4.942990157209381e-06, + "loss": 0.6685, + "step": 127 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 2.61933970451355, + "learning_rate": 4.9420851720997674e-06, + "loss": 0.8812, + "step": 128 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 2.7395646572113037, + "learning_rate": 4.94117314465012e-06, + "loss": 1.3014, + "step": 129 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 3.065484046936035, + "learning_rate": 4.940254077490487e-06, + "loss": 0.6978, + "step": 130 + }, + { + "epoch": 0.7081081081081081, + "grad_norm": 2.895038366317749, + "learning_rate": 4.939327973271222e-06, + "loss": 0.6249, + "step": 131 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 3.1773312091827393, + "learning_rate": 4.9383948346629665e-06, + "loss": 0.6423, + "step": 132 + }, + { + "epoch": 0.7189189189189189, + "grad_norm": 2.2378008365631104, + "learning_rate": 4.937454664356652e-06, + "loss": 0.7193, + "step": 133 + }, + { + "epoch": 0.7243243243243244, + "grad_norm": 2.5673701763153076, + "learning_rate": 4.9365074650634855e-06, + "loss": 0.7065, + "step": 134 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 2.7348387241363525, + "learning_rate": 4.9355532395149445e-06, + "loss": 1.0046, + "step": 135 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 2.391741991043091, + "learning_rate": 4.9345919904627655e-06, + "loss": 0.6771, + "step": 136 + }, + { + "epoch": 0.7405405405405405, + "grad_norm": 2.2096705436706543, + "learning_rate": 4.933623720678944e-06, + "loss": 0.6589, + "step": 137 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 3.0840072631835938, + "learning_rate": 4.932648432955718e-06, + "loss": 0.8755, + "step": 138 + }, + { + "epoch": 0.7513513513513513, + "grad_norm": 2.4970428943634033, + "learning_rate": 4.931666130105564e-06, + "loss": 0.6685, + "step": 139 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 4.315455436706543, + "learning_rate": 4.930676814961189e-06, + "loss": 0.8101, + "step": 140 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 5.388065814971924, + "learning_rate": 4.92968049037552e-06, + "loss": 0.8193, + "step": 141 + }, + { + "epoch": 0.7675675675675676, + "grad_norm": 2.6107139587402344, + "learning_rate": 4.9286771592217005e-06, + "loss": 0.7852, + "step": 142 + }, + { + "epoch": 0.772972972972973, + "grad_norm": 3.936556577682495, + "learning_rate": 4.927666824393076e-06, + "loss": 1.0388, + "step": 143 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 2.74424409866333, + "learning_rate": 4.926649488803191e-06, + "loss": 0.8266, + "step": 144 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 2.8998451232910156, + "learning_rate": 4.925625155385776e-06, + "loss": 0.4895, + "step": 145 + }, + { + "epoch": 0.7891891891891892, + "grad_norm": 3.0631520748138428, + "learning_rate": 4.924593827094743e-06, + "loss": 0.8759, + "step": 146 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 3.233267307281494, + "learning_rate": 4.923555506904176e-06, + "loss": 0.701, + "step": 147 + }, + { + "epoch": 0.8, + "grad_norm": 2.87701416015625, + "learning_rate": 4.922510197808321e-06, + "loss": 1.1327, + "step": 148 + }, + { + "epoch": 0.8054054054054054, + "grad_norm": 3.650576114654541, + "learning_rate": 4.921457902821578e-06, + "loss": 0.7587, + "step": 149 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 3.232112407684326, + "learning_rate": 4.920398624978493e-06, + "loss": 1.2158, + "step": 150 + }, + { + "epoch": 0.8162162162162162, + "grad_norm": 2.468384027481079, + "learning_rate": 4.919332367333748e-06, + "loss": 0.6852, + "step": 151 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 2.5947415828704834, + "learning_rate": 4.918259132962154e-06, + "loss": 0.6611, + "step": 152 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 3.0171427726745605, + "learning_rate": 4.917178924958638e-06, + "loss": 0.7327, + "step": 153 + }, + { + "epoch": 0.8324324324324325, + "grad_norm": 3.293184518814087, + "learning_rate": 4.916091746438243e-06, + "loss": 0.8528, + "step": 154 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 4.0570969581604, + "learning_rate": 4.9149976005361085e-06, + "loss": 0.9141, + "step": 155 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 2.8782784938812256, + "learning_rate": 4.913896490407467e-06, + "loss": 1.1132, + "step": 156 + }, + { + "epoch": 0.8486486486486486, + "grad_norm": 2.5671517848968506, + "learning_rate": 4.912788419227635e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.8540540540540541, + "grad_norm": 2.9445390701293945, + "learning_rate": 4.911673390192002e-06, + "loss": 0.9227, + "step": 158 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 2.472595453262329, + "learning_rate": 4.910551406516023e-06, + "loss": 0.8154, + "step": 159 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 2.5233397483825684, + "learning_rate": 4.909422471435207e-06, + "loss": 0.9897, + "step": 160 + }, + { + "epoch": 0.8702702702702703, + "grad_norm": 3.3919546604156494, + "learning_rate": 4.90828658820511e-06, + "loss": 0.6162, + "step": 161 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 3.060908555984497, + "learning_rate": 4.907143760101325e-06, + "loss": 0.5734, + "step": 162 + }, + { + "epoch": 0.8810810810810811, + "grad_norm": 3.4584782123565674, + "learning_rate": 4.905993990419472e-06, + "loss": 0.8328, + "step": 163 + }, + { + "epoch": 0.8864864864864865, + "grad_norm": 2.936570644378662, + "learning_rate": 4.904837282475187e-06, + "loss": 0.6787, + "step": 164 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 2.564837694168091, + "learning_rate": 4.9036736396041165e-06, + "loss": 0.9658, + "step": 165 + }, + { + "epoch": 0.8972972972972973, + "grad_norm": 3.2509360313415527, + "learning_rate": 4.902503065161905e-06, + "loss": 0.7899, + "step": 166 + }, + { + "epoch": 0.9027027027027027, + "grad_norm": 2.9730329513549805, + "learning_rate": 4.901325562524185e-06, + "loss": 0.9476, + "step": 167 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 3.044980049133301, + "learning_rate": 4.900141135086569e-06, + "loss": 0.7589, + "step": 168 + }, + { + "epoch": 0.9135135135135135, + "grad_norm": 3.030585527420044, + "learning_rate": 4.898949786264638e-06, + "loss": 0.6724, + "step": 169 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 2.249122142791748, + "learning_rate": 4.897751519493933e-06, + "loss": 0.6968, + "step": 170 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 2.9816982746124268, + "learning_rate": 4.896546338229945e-06, + "loss": 0.7984, + "step": 171 + }, + { + "epoch": 0.9297297297297298, + "grad_norm": 2.415736675262451, + "learning_rate": 4.8953342459481034e-06, + "loss": 0.6109, + "step": 172 + }, + { + "epoch": 0.9351351351351351, + "grad_norm": 2.740518808364868, + "learning_rate": 4.894115246143768e-06, + "loss": 0.8126, + "step": 173 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 2.7610201835632324, + "learning_rate": 4.892889342332218e-06, + "loss": 0.6862, + "step": 174 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 3.057025194168091, + "learning_rate": 4.891656538048642e-06, + "loss": 0.9895, + "step": 175 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 2.569751262664795, + "learning_rate": 4.890416836848128e-06, + "loss": 0.8481, + "step": 176 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 2.4443397521972656, + "learning_rate": 4.889170242305652e-06, + "loss": 0.6478, + "step": 177 + }, + { + "epoch": 0.9621621621621622, + "grad_norm": 2.5009846687316895, + "learning_rate": 4.887916758016069e-06, + "loss": 0.9714, + "step": 178 + }, + { + "epoch": 0.9675675675675676, + "grad_norm": 3.101975202560425, + "learning_rate": 4.886656387594104e-06, + "loss": 1.1264, + "step": 179 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 2.6144704818725586, + "learning_rate": 4.885389134674338e-06, + "loss": 0.7664, + "step": 180 + }, + { + "epoch": 0.9783783783783784, + "grad_norm": 2.5834381580352783, + "learning_rate": 4.884115002911197e-06, + "loss": 0.6131, + "step": 181 + }, + { + "epoch": 0.9837837837837838, + "grad_norm": 2.5378055572509766, + "learning_rate": 4.88283399597895e-06, + "loss": 0.8733, + "step": 182 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 2.4095377922058105, + "learning_rate": 4.881546117571686e-06, + "loss": 0.643, + "step": 183 + }, + { + "epoch": 0.9945945945945946, + "grad_norm": 2.9554507732391357, + "learning_rate": 4.8802513714033135e-06, + "loss": 0.7287, + "step": 184 + }, + { + "epoch": 1.0, + "grad_norm": 2.8279213905334473, + "learning_rate": 4.878949761207545e-06, + "loss": 0.9927, + "step": 185 + }, + { + "epoch": 1.0054054054054054, + "grad_norm": 2.9361412525177, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.66, + "step": 186 + }, + { + "epoch": 1.0108108108108107, + "grad_norm": 3.392244338989258, + "learning_rate": 4.876325963767623e-06, + "loss": 0.594, + "step": 187 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 2.6276044845581055, + "learning_rate": 4.875003784089822e-06, + "loss": 0.5825, + "step": 188 + }, + { + "epoch": 1.0216216216216216, + "grad_norm": 2.2875545024871826, + "learning_rate": 4.873674755517305e-06, + "loss": 0.6594, + "step": 189 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 2.8086795806884766, + "learning_rate": 4.872338881882645e-06, + "loss": 0.7536, + "step": 190 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 2.3685200214385986, + "learning_rate": 4.870996167038154e-06, + "loss": 0.4849, + "step": 191 + }, + { + "epoch": 1.037837837837838, + "grad_norm": 3.0264766216278076, + "learning_rate": 4.869646614855877e-06, + "loss": 0.3771, + "step": 192 + }, + { + "epoch": 1.0432432432432432, + "grad_norm": 4.335122108459473, + "learning_rate": 4.868290229227567e-06, + "loss": 0.8545, + "step": 193 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 3.442172050476074, + "learning_rate": 4.866927014064692e-06, + "loss": 0.3698, + "step": 194 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 3.326539993286133, + "learning_rate": 4.86555697329841e-06, + "loss": 0.8468, + "step": 195 + }, + { + "epoch": 1.0594594594594595, + "grad_norm": 3.0372447967529297, + "learning_rate": 4.864180110879562e-06, + "loss": 0.8232, + "step": 196 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 2.955343008041382, + "learning_rate": 4.862796430778663e-06, + "loss": 0.4097, + "step": 197 + }, + { + "epoch": 1.0702702702702702, + "grad_norm": 2.4095399379730225, + "learning_rate": 4.861405936985889e-06, + "loss": 0.6746, + "step": 198 + }, + { + "epoch": 1.0756756756756758, + "grad_norm": 2.763500452041626, + "learning_rate": 4.860008633511059e-06, + "loss": 0.6605, + "step": 199 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 2.6751155853271484, + "learning_rate": 4.8586045243836384e-06, + "loss": 0.471, + "step": 200 + }, + { + "epoch": 1.0864864864864865, + "grad_norm": 3.3507862091064453, + "learning_rate": 4.857193613652711e-06, + "loss": 0.7665, + "step": 201 + }, + { + "epoch": 1.0918918918918918, + "grad_norm": 3.3064827919006348, + "learning_rate": 4.8557759053869775e-06, + "loss": 0.6436, + "step": 202 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 2.571828603744507, + "learning_rate": 4.854351403674741e-06, + "loss": 0.4642, + "step": 203 + }, + { + "epoch": 1.1027027027027028, + "grad_norm": 2.883220911026001, + "learning_rate": 4.852920112623895e-06, + "loss": 0.5737, + "step": 204 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 3.026144027709961, + "learning_rate": 4.851482036361912e-06, + "loss": 0.7302, + "step": 205 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 2.6689612865448, + "learning_rate": 4.850037179035829e-06, + "loss": 0.5229, + "step": 206 + }, + { + "epoch": 1.118918918918919, + "grad_norm": 2.4019956588745117, + "learning_rate": 4.8485855448122425e-06, + "loss": 0.5529, + "step": 207 + }, + { + "epoch": 1.1243243243243244, + "grad_norm": 2.3546230792999268, + "learning_rate": 4.847127137877286e-06, + "loss": 0.3635, + "step": 208 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 2.999096393585205, + "learning_rate": 4.8456619624366285e-06, + "loss": 0.8149, + "step": 209 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 10.072900772094727, + "learning_rate": 4.844190022715456e-06, + "loss": 0.8333, + "step": 210 + }, + { + "epoch": 1.1405405405405404, + "grad_norm": 2.222123384475708, + "learning_rate": 4.84271132295846e-06, + "loss": 0.3717, + "step": 211 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 2.8751113414764404, + "learning_rate": 4.841225867429826e-06, + "loss": 0.5994, + "step": 212 + }, + { + "epoch": 1.1513513513513514, + "grad_norm": 2.9580111503601074, + "learning_rate": 4.839733660413224e-06, + "loss": 0.8382, + "step": 213 + }, + { + "epoch": 1.1567567567567567, + "grad_norm": 4.628892421722412, + "learning_rate": 4.838234706211792e-06, + "loss": 0.818, + "step": 214 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 2.5103509426116943, + "learning_rate": 4.836729009148124e-06, + "loss": 0.4267, + "step": 215 + }, + { + "epoch": 1.1675675675675676, + "grad_norm": 2.6093738079071045, + "learning_rate": 4.835216573564261e-06, + "loss": 0.3472, + "step": 216 + }, + { + "epoch": 1.172972972972973, + "grad_norm": 3.0792338848114014, + "learning_rate": 4.833697403821672e-06, + "loss": 0.6323, + "step": 217 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 2.845163345336914, + "learning_rate": 4.8321715043012516e-06, + "loss": 0.6831, + "step": 218 + }, + { + "epoch": 1.1837837837837837, + "grad_norm": 3.0433948040008545, + "learning_rate": 4.830638879403296e-06, + "loss": 0.3682, + "step": 219 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 2.6533594131469727, + "learning_rate": 4.8290995335475e-06, + "loss": 0.4154, + "step": 220 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 2.9271352291107178, + "learning_rate": 4.827553471172935e-06, + "loss": 0.3991, + "step": 221 + }, + { + "epoch": 1.2, + "grad_norm": 2.9243528842926025, + "learning_rate": 4.826000696738045e-06, + "loss": 0.4538, + "step": 222 + }, + { + "epoch": 1.2054054054054055, + "grad_norm": 2.537332534790039, + "learning_rate": 4.824441214720629e-06, + "loss": 0.7692, + "step": 223 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 3.9193246364593506, + "learning_rate": 4.8228750296178275e-06, + "loss": 0.6038, + "step": 224 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 2.6646728515625, + "learning_rate": 4.821302145946113e-06, + "loss": 0.4147, + "step": 225 + }, + { + "epoch": 1.2216216216216216, + "grad_norm": 2.6519482135772705, + "learning_rate": 4.819722568241274e-06, + "loss": 0.5398, + "step": 226 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 2.2018048763275146, + "learning_rate": 4.818136301058401e-06, + "loss": 0.3864, + "step": 227 + }, + { + "epoch": 1.2324324324324325, + "grad_norm": 2.5660712718963623, + "learning_rate": 4.816543348971879e-06, + "loss": 0.5712, + "step": 228 + }, + { + "epoch": 1.2378378378378379, + "grad_norm": 3.237663745880127, + "learning_rate": 4.814943716575368e-06, + "loss": 0.662, + "step": 229 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 2.5570430755615234, + "learning_rate": 4.813337408481793e-06, + "loss": 0.8661, + "step": 230 + }, + { + "epoch": 1.2486486486486488, + "grad_norm": 2.9231269359588623, + "learning_rate": 4.811724429323329e-06, + "loss": 0.9218, + "step": 231 + }, + { + "epoch": 1.2540540540540541, + "grad_norm": 3.637084722518921, + "learning_rate": 4.810104783751389e-06, + "loss": 0.5597, + "step": 232 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 3.0218842029571533, + "learning_rate": 4.8084784764366125e-06, + "loss": 0.4786, + "step": 233 + }, + { + "epoch": 1.2648648648648648, + "grad_norm": 2.770214080810547, + "learning_rate": 4.806845512068846e-06, + "loss": 0.5219, + "step": 234 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 3.093053102493286, + "learning_rate": 4.805205895357137e-06, + "loss": 0.643, + "step": 235 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 2.6373348236083984, + "learning_rate": 4.803559631029713e-06, + "loss": 0.5858, + "step": 236 + }, + { + "epoch": 1.281081081081081, + "grad_norm": 2.452030897140503, + "learning_rate": 4.801906723833973e-06, + "loss": 0.4185, + "step": 237 + }, + { + "epoch": 1.2864864864864864, + "grad_norm": 2.72564697265625, + "learning_rate": 4.8002471785364734e-06, + "loss": 0.4917, + "step": 238 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 3.0389158725738525, + "learning_rate": 4.798580999922913e-06, + "loss": 0.645, + "step": 239 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 3.7002289295196533, + "learning_rate": 4.796908192798117e-06, + "loss": 0.5378, + "step": 240 + }, + { + "epoch": 1.3027027027027027, + "grad_norm": 2.1876111030578613, + "learning_rate": 4.7952287619860276e-06, + "loss": 0.5197, + "step": 241 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 3.903337240219116, + "learning_rate": 4.793542712329689e-06, + "loss": 1.0226, + "step": 242 + }, + { + "epoch": 1.3135135135135134, + "grad_norm": 2.3623552322387695, + "learning_rate": 4.791850048691228e-06, + "loss": 0.5502, + "step": 243 + }, + { + "epoch": 1.318918918918919, + "grad_norm": 3.0669031143188477, + "learning_rate": 4.79015077595185e-06, + "loss": 0.6976, + "step": 244 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 3.1480472087860107, + "learning_rate": 4.788444899011816e-06, + "loss": 0.4795, + "step": 245 + }, + { + "epoch": 1.3297297297297297, + "grad_norm": 3.7051920890808105, + "learning_rate": 4.786732422790432e-06, + "loss": 0.6526, + "step": 246 + }, + { + "epoch": 1.3351351351351353, + "grad_norm": 3.4358389377593994, + "learning_rate": 4.785013352226036e-06, + "loss": 0.5551, + "step": 247 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 2.3789355754852295, + "learning_rate": 4.7832876922759805e-06, + "loss": 0.3151, + "step": 248 + }, + { + "epoch": 1.345945945945946, + "grad_norm": 2.4843716621398926, + "learning_rate": 4.781555447916622e-06, + "loss": 0.6713, + "step": 249 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 3.0176303386688232, + "learning_rate": 4.779816624143302e-06, + "loss": 0.437, + "step": 250 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 2.868350028991699, + "learning_rate": 4.77807122597034e-06, + "loss": 0.7632, + "step": 251 + }, + { + "epoch": 1.3621621621621622, + "grad_norm": 2.4629738330841064, + "learning_rate": 4.776319258431009e-06, + "loss": 0.4894, + "step": 252 + }, + { + "epoch": 1.3675675675675676, + "grad_norm": 2.798297882080078, + "learning_rate": 4.77456072657753e-06, + "loss": 0.4456, + "step": 253 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 3.2977547645568848, + "learning_rate": 4.772795635481053e-06, + "loss": 0.5381, + "step": 254 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 4.1061906814575195, + "learning_rate": 4.77102399023164e-06, + "loss": 1.0302, + "step": 255 + }, + { + "epoch": 1.3837837837837839, + "grad_norm": 3.943284511566162, + "learning_rate": 4.769245795938261e-06, + "loss": 0.4875, + "step": 256 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 2.6420533657073975, + "learning_rate": 4.767461057728763e-06, + "loss": 0.4923, + "step": 257 + }, + { + "epoch": 1.3945945945945946, + "grad_norm": 3.3152263164520264, + "learning_rate": 4.76566978074987e-06, + "loss": 0.6699, + "step": 258 + }, + { + "epoch": 1.4, + "grad_norm": 2.6928882598876953, + "learning_rate": 4.7638719701671586e-06, + "loss": 0.6117, + "step": 259 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 2.706597328186035, + "learning_rate": 4.762067631165049e-06, + "loss": 0.8534, + "step": 260 + }, + { + "epoch": 1.4108108108108108, + "grad_norm": 2.9912848472595215, + "learning_rate": 4.760256768946787e-06, + "loss": 0.5057, + "step": 261 + }, + { + "epoch": 1.4162162162162162, + "grad_norm": 2.7098443508148193, + "learning_rate": 4.758439388734429e-06, + "loss": 0.7286, + "step": 262 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 3.1288092136383057, + "learning_rate": 4.7566154957688276e-06, + "loss": 0.9827, + "step": 263 + }, + { + "epoch": 1.427027027027027, + "grad_norm": 3.0505919456481934, + "learning_rate": 4.754785095309617e-06, + "loss": 0.7042, + "step": 264 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 2.6800339221954346, + "learning_rate": 4.752948192635199e-06, + "loss": 0.5179, + "step": 265 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 2.2246861457824707, + "learning_rate": 4.751104793042722e-06, + "loss": 0.8527, + "step": 266 + }, + { + "epoch": 1.4432432432432432, + "grad_norm": 2.4242751598358154, + "learning_rate": 4.7492549018480725e-06, + "loss": 0.5627, + "step": 267 + }, + { + "epoch": 1.4486486486486487, + "grad_norm": 2.763244152069092, + "learning_rate": 4.747398524385858e-06, + "loss": 0.8981, + "step": 268 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 2.856595993041992, + "learning_rate": 4.745535666009389e-06, + "loss": 0.5455, + "step": 269 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 2.4168624877929688, + "learning_rate": 4.743666332090664e-06, + "loss": 0.4348, + "step": 270 + }, + { + "epoch": 1.464864864864865, + "grad_norm": 2.5408060550689697, + "learning_rate": 4.74179052802036e-06, + "loss": 0.5524, + "step": 271 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 2.6216673851013184, + "learning_rate": 4.739908259207807e-06, + "loss": 0.7469, + "step": 272 + }, + { + "epoch": 1.4756756756756757, + "grad_norm": 5.397300720214844, + "learning_rate": 4.738019531080981e-06, + "loss": 0.7216, + "step": 273 + }, + { + "epoch": 1.481081081081081, + "grad_norm": 3.3481080532073975, + "learning_rate": 4.7361243490864825e-06, + "loss": 0.7527, + "step": 274 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 2.7943873405456543, + "learning_rate": 4.734222718689527e-06, + "loss": 0.7437, + "step": 275 + }, + { + "epoch": 1.491891891891892, + "grad_norm": 2.206890344619751, + "learning_rate": 4.732314645373922e-06, + "loss": 0.5187, + "step": 276 + }, + { + "epoch": 1.4972972972972973, + "grad_norm": 2.76442813873291, + "learning_rate": 4.730400134642055e-06, + "loss": 0.7186, + "step": 277 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 3.4754087924957275, + "learning_rate": 4.728479192014879e-06, + "loss": 0.9655, + "step": 278 + }, + { + "epoch": 1.5081081081081082, + "grad_norm": 2.923779249191284, + "learning_rate": 4.726551823031895e-06, + "loss": 0.6251, + "step": 279 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 3.1142773628234863, + "learning_rate": 4.7246180332511335e-06, + "loss": 0.4805, + "step": 280 + }, + { + "epoch": 1.518918918918919, + "grad_norm": 2.3477070331573486, + "learning_rate": 4.722677828249142e-06, + "loss": 1.0939, + "step": 281 + }, + { + "epoch": 1.5243243243243243, + "grad_norm": 2.8418569564819336, + "learning_rate": 4.720731213620972e-06, + "loss": 0.9485, + "step": 282 + }, + { + "epoch": 1.5297297297297296, + "grad_norm": 2.462710380554199, + "learning_rate": 4.718778194980152e-06, + "loss": 0.5805, + "step": 283 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 3.2379209995269775, + "learning_rate": 4.7168187779586805e-06, + "loss": 0.77, + "step": 284 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 3.0701661109924316, + "learning_rate": 4.71485296820701e-06, + "loss": 0.5932, + "step": 285 + }, + { + "epoch": 1.545945945945946, + "grad_norm": 4.099547386169434, + "learning_rate": 4.7128807713940245e-06, + "loss": 0.6296, + "step": 286 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 2.5529167652130127, + "learning_rate": 4.710902193207028e-06, + "loss": 0.6201, + "step": 287 + }, + { + "epoch": 1.5567567567567568, + "grad_norm": 2.794926881790161, + "learning_rate": 4.708917239351727e-06, + "loss": 0.5682, + "step": 288 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 3.2522501945495605, + "learning_rate": 4.706925915552214e-06, + "loss": 0.8877, + "step": 289 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 2.811847448348999, + "learning_rate": 4.704928227550949e-06, + "loss": 0.6521, + "step": 290 + }, + { + "epoch": 1.572972972972973, + "grad_norm": 2.7060673236846924, + "learning_rate": 4.702924181108745e-06, + "loss": 0.4929, + "step": 291 + }, + { + "epoch": 1.5783783783783782, + "grad_norm": 2.5009031295776367, + "learning_rate": 4.700913782004755e-06, + "loss": 0.4515, + "step": 292 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 2.6722700595855713, + "learning_rate": 4.698897036036446e-06, + "loss": 0.5477, + "step": 293 + }, + { + "epoch": 1.5891891891891892, + "grad_norm": 3.3333957195281982, + "learning_rate": 4.696873949019591e-06, + "loss": 0.9589, + "step": 294 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 2.4862897396087646, + "learning_rate": 4.694844526788248e-06, + "loss": 0.4425, + "step": 295 + }, + { + "epoch": 1.6, + "grad_norm": 2.78708553314209, + "learning_rate": 4.692808775194745e-06, + "loss": 0.4899, + "step": 296 + }, + { + "epoch": 1.6054054054054054, + "grad_norm": 2.9121289253234863, + "learning_rate": 4.690766700109659e-06, + "loss": 0.4884, + "step": 297 + }, + { + "epoch": 1.6108108108108108, + "grad_norm": 4.692054271697998, + "learning_rate": 4.688718307421807e-06, + "loss": 0.8977, + "step": 298 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 3.1290926933288574, + "learning_rate": 4.686663603038222e-06, + "loss": 0.6833, + "step": 299 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 3.5091123580932617, + "learning_rate": 4.6846025928841365e-06, + "loss": 0.9141, + "step": 300 + }, + { + "epoch": 1.627027027027027, + "grad_norm": 2.5466184616088867, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.5121, + "step": 301 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 2.7833092212677, + "learning_rate": 4.68046167905631e-06, + "loss": 0.5399, + "step": 302 + }, + { + "epoch": 1.637837837837838, + "grad_norm": 3.05135440826416, + "learning_rate": 4.678381787323889e-06, + "loss": 0.7921, + "step": 303 + }, + { + "epoch": 1.6432432432432433, + "grad_norm": 2.2391726970672607, + "learning_rate": 4.676295613703577e-06, + "loss": 0.7178, + "step": 304 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 2.3654022216796875, + "learning_rate": 4.674203164211357e-06, + "loss": 0.7162, + "step": 305 + }, + { + "epoch": 1.654054054054054, + "grad_norm": 2.436009645462036, + "learning_rate": 4.67210444488131e-06, + "loss": 0.6539, + "step": 306 + }, + { + "epoch": 1.6594594594594594, + "grad_norm": 2.6034209728240967, + "learning_rate": 4.669999461765599e-06, + "loss": 0.7214, + "step": 307 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 2.804229497909546, + "learning_rate": 4.6678882209344474e-06, + "loss": 0.7451, + "step": 308 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 2.6239655017852783, + "learning_rate": 4.665770728476127e-06, + "loss": 0.6464, + "step": 309 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 2.9320099353790283, + "learning_rate": 4.663646990496939e-06, + "loss": 0.6669, + "step": 310 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 3.09713077545166, + "learning_rate": 4.661517013121189e-06, + "loss": 0.8972, + "step": 311 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 3.6576132774353027, + "learning_rate": 4.659380802491181e-06, + "loss": 0.6286, + "step": 312 + }, + { + "epoch": 1.691891891891892, + "grad_norm": 2.9320433139801025, + "learning_rate": 4.6572383647671915e-06, + "loss": 0.3631, + "step": 313 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 3.399357557296753, + "learning_rate": 4.655089706127457e-06, + "loss": 0.5682, + "step": 314 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 2.7667412757873535, + "learning_rate": 4.652934832768148e-06, + "loss": 0.5457, + "step": 315 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 2.3023321628570557, + "learning_rate": 4.650773750903363e-06, + "loss": 0.6601, + "step": 316 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 2.6584670543670654, + "learning_rate": 4.6486064667651005e-06, + "loss": 0.5882, + "step": 317 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 5.528168678283691, + "learning_rate": 4.646432986603245e-06, + "loss": 0.7628, + "step": 318 + }, + { + "epoch": 1.7243243243243245, + "grad_norm": 3.054884195327759, + "learning_rate": 4.644253316685552e-06, + "loss": 0.6877, + "step": 319 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 3.2672388553619385, + "learning_rate": 4.6420674632976205e-06, + "loss": 0.7026, + "step": 320 + }, + { + "epoch": 1.7351351351351352, + "grad_norm": 3.109384536743164, + "learning_rate": 4.639875432742886e-06, + "loss": 0.5236, + "step": 321 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 3.3593883514404297, + "learning_rate": 4.6376772313425975e-06, + "loss": 0.6463, + "step": 322 + }, + { + "epoch": 1.7459459459459459, + "grad_norm": 2.6352698802948, + "learning_rate": 4.635472865435795e-06, + "loss": 0.6903, + "step": 323 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 2.751690149307251, + "learning_rate": 4.6332623413792995e-06, + "loss": 0.7342, + "step": 324 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 2.670915126800537, + "learning_rate": 4.6310456655476874e-06, + "loss": 0.4302, + "step": 325 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 2.7648138999938965, + "learning_rate": 4.6288228443332786e-06, + "loss": 0.5108, + "step": 326 + }, + { + "epoch": 1.7675675675675677, + "grad_norm": 2.7451536655426025, + "learning_rate": 4.626593884146111e-06, + "loss": 0.7646, + "step": 327 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 2.4656403064727783, + "learning_rate": 4.624358791413928e-06, + "loss": 0.5529, + "step": 328 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 2.5987517833709717, + "learning_rate": 4.622117572582159e-06, + "loss": 0.609, + "step": 329 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 3.3843371868133545, + "learning_rate": 4.619870234113894e-06, + "loss": 0.9146, + "step": 330 + }, + { + "epoch": 1.7891891891891891, + "grad_norm": 2.3542068004608154, + "learning_rate": 4.617616782489878e-06, + "loss": 0.6887, + "step": 331 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 2.2049715518951416, + "learning_rate": 4.615357224208477e-06, + "loss": 0.505, + "step": 332 + }, + { + "epoch": 1.8, + "grad_norm": 2.453920364379883, + "learning_rate": 4.613091565785674e-06, + "loss": 0.8384, + "step": 333 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 2.5751583576202393, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5512, + "step": 334 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 2.524075984954834, + "learning_rate": 4.608541974667714e-06, + "loss": 0.4877, + "step": 335 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 2.2856955528259277, + "learning_rate": 4.606258055092397e-06, + "loss": 0.5583, + "step": 336 + }, + { + "epoch": 1.8216216216216217, + "grad_norm": 2.2773683071136475, + "learning_rate": 4.603968061615321e-06, + "loss": 0.5421, + "step": 337 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 4.085512161254883, + "learning_rate": 4.601672000840231e-06, + "loss": 0.942, + "step": 338 + }, + { + "epoch": 1.8324324324324324, + "grad_norm": 2.3710968494415283, + "learning_rate": 4.5993698793883715e-06, + "loss": 0.3773, + "step": 339 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 2.745534658432007, + "learning_rate": 4.597061703898462e-06, + "loss": 0.9694, + "step": 340 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 2.463207244873047, + "learning_rate": 4.594747481026685e-06, + "loss": 0.4667, + "step": 341 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 2.7216601371765137, + "learning_rate": 4.592427217446656e-06, + "loss": 0.4267, + "step": 342 + }, + { + "epoch": 1.8540540540540542, + "grad_norm": 2.545664072036743, + "learning_rate": 4.590100919849413e-06, + "loss": 0.9245, + "step": 343 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 3.692840337753296, + "learning_rate": 4.587768594943396e-06, + "loss": 0.7502, + "step": 344 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 2.993229627609253, + "learning_rate": 4.585430249454426e-06, + "loss": 0.4689, + "step": 345 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 2.162867546081543, + "learning_rate": 4.583085890125682e-06, + "loss": 0.6188, + "step": 346 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 2.2169792652130127, + "learning_rate": 4.5807355237176896e-06, + "loss": 0.6352, + "step": 347 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 3.978985548019409, + "learning_rate": 4.578379157008296e-06, + "loss": 0.464, + "step": 348 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 2.236682653427124, + "learning_rate": 4.57601679679265e-06, + "loss": 0.5943, + "step": 349 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 2.528754472732544, + "learning_rate": 4.573648449883188e-06, + "loss": 0.6949, + "step": 350 + }, + { + "epoch": 1.8972972972972975, + "grad_norm": 2.7673721313476562, + "learning_rate": 4.571274123109606e-06, + "loss": 0.4333, + "step": 351 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 2.698012351989746, + "learning_rate": 4.568893823318847e-06, + "loss": 0.6796, + "step": 352 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 2.9640560150146484, + "learning_rate": 4.566507557375077e-06, + "loss": 0.6139, + "step": 353 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 2.417628526687622, + "learning_rate": 4.5641153321596684e-06, + "loss": 0.4515, + "step": 354 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 2.676739454269409, + "learning_rate": 4.56171715457118e-06, + "loss": 0.8426, + "step": 355 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 2.8428189754486084, + "learning_rate": 4.559313031525331e-06, + "loss": 0.5806, + "step": 356 + }, + { + "epoch": 1.9297297297297298, + "grad_norm": 2.6817944049835205, + "learning_rate": 4.55690296995499e-06, + "loss": 0.5927, + "step": 357 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 3.5939931869506836, + "learning_rate": 4.554486976810149e-06, + "loss": 0.9986, + "step": 358 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 2.86688494682312, + "learning_rate": 4.552065059057906e-06, + "loss": 0.6813, + "step": 359 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 2.9295246601104736, + "learning_rate": 4.549637223682441e-06, + "loss": 1.0832, + "step": 360 + }, + { + "epoch": 1.9513513513513514, + "grad_norm": 2.6939451694488525, + "learning_rate": 4.547203477685005e-06, + "loss": 0.7377, + "step": 361 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 2.226055145263672, + "learning_rate": 4.544763828083888e-06, + "loss": 0.5412, + "step": 362 + }, + { + "epoch": 1.962162162162162, + "grad_norm": 2.490187406539917, + "learning_rate": 4.542318281914405e-06, + "loss": 0.6955, + "step": 363 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 2.9241302013397217, + "learning_rate": 4.53986684622888e-06, + "loss": 0.6774, + "step": 364 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 2.988084554672241, + "learning_rate": 4.537409528096615e-06, + "loss": 0.5832, + "step": 365 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 2.9380626678466797, + "learning_rate": 4.534946334603879e-06, + "loss": 0.606, + "step": 366 + }, + { + "epoch": 1.983783783783784, + "grad_norm": 2.667588710784912, + "learning_rate": 4.532477272853882e-06, + "loss": 0.4991, + "step": 367 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 2.9711899757385254, + "learning_rate": 4.530002349966759e-06, + "loss": 0.4442, + "step": 368 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 3.443957805633545, + "learning_rate": 4.5275215730795445e-06, + "loss": 0.6566, + "step": 369 + }, + { + "epoch": 2.0, + "grad_norm": 3.590317487716675, + "learning_rate": 4.525034949346156e-06, + "loss": 0.5687, + "step": 370 + }, + { + "epoch": 2.0054054054054054, + "grad_norm": 3.678600549697876, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4458, + "step": 371 + }, + { + "epoch": 2.0108108108108107, + "grad_norm": 3.803563356399536, + "learning_rate": 4.5200441900408045e-06, + "loss": 0.4418, + "step": 372 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 2.9187233448028564, + "learning_rate": 4.517540068860898e-06, + "loss": 0.7057, + "step": 373 + }, + { + "epoch": 2.0216216216216214, + "grad_norm": 2.693603515625, + "learning_rate": 4.515030129618884e-06, + "loss": 0.4491, + "step": 374 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 2.3883047103881836, + "learning_rate": 4.512514379552779e-06, + "loss": 0.3571, + "step": 375 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 4.558557033538818, + "learning_rate": 4.509992825917352e-06, + "loss": 0.5056, + "step": 376 + }, + { + "epoch": 2.037837837837838, + "grad_norm": 3.9574761390686035, + "learning_rate": 4.507465475984109e-06, + "loss": 0.6834, + "step": 377 + }, + { + "epoch": 2.0432432432432432, + "grad_norm": 5.34630012512207, + "learning_rate": 4.504932337041272e-06, + "loss": 0.6726, + "step": 378 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 3.198740243911743, + "learning_rate": 4.502393416393757e-06, + "loss": 0.4032, + "step": 379 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 3.347480297088623, + "learning_rate": 4.4998487213631515e-06, + "loss": 0.5442, + "step": 380 + }, + { + "epoch": 2.0594594594594593, + "grad_norm": 3.940531015396118, + "learning_rate": 4.497298259287696e-06, + "loss": 0.6181, + "step": 381 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 3.0910496711730957, + "learning_rate": 4.494742037522261e-06, + "loss": 0.3829, + "step": 382 + }, + { + "epoch": 2.0702702702702704, + "grad_norm": 4.060451984405518, + "learning_rate": 4.4921800634383295e-06, + "loss": 0.4953, + "step": 383 + }, + { + "epoch": 2.075675675675676, + "grad_norm": 3.1667511463165283, + "learning_rate": 4.4896123444239655e-06, + "loss": 0.3254, + "step": 384 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 3.0239670276641846, + "learning_rate": 4.487038887883809e-06, + "loss": 0.555, + "step": 385 + }, + { + "epoch": 2.0864864864864865, + "grad_norm": 2.8815383911132812, + "learning_rate": 4.484459701239038e-06, + "loss": 0.665, + "step": 386 + }, + { + "epoch": 2.091891891891892, + "grad_norm": 3.615537166595459, + "learning_rate": 4.481874791927358e-06, + "loss": 0.2652, + "step": 387 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 3.407407283782959, + "learning_rate": 4.479284167402977e-06, + "loss": 0.3811, + "step": 388 + }, + { + "epoch": 2.1027027027027025, + "grad_norm": 2.6651623249053955, + "learning_rate": 4.476687835136585e-06, + "loss": 0.2463, + "step": 389 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 3.5145862102508545, + "learning_rate": 4.47408580261533e-06, + "loss": 0.5507, + "step": 390 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 3.0952725410461426, + "learning_rate": 4.471478077342798e-06, + "loss": 0.288, + "step": 391 + }, + { + "epoch": 2.118918918918919, + "grad_norm": 2.634775400161743, + "learning_rate": 4.468864666838994e-06, + "loss": 0.5169, + "step": 392 + }, + { + "epoch": 2.1243243243243244, + "grad_norm": 3.7388594150543213, + "learning_rate": 4.4662455786403125e-06, + "loss": 0.3327, + "step": 393 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 3.8197360038757324, + "learning_rate": 4.463620820299528e-06, + "loss": 0.3877, + "step": 394 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 3.0073485374450684, + "learning_rate": 4.4609903993857606e-06, + "loss": 0.5425, + "step": 395 + }, + { + "epoch": 2.1405405405405404, + "grad_norm": 2.6923868656158447, + "learning_rate": 4.458354323484462e-06, + "loss": 0.5257, + "step": 396 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 3.2151331901550293, + "learning_rate": 4.45571260019739e-06, + "loss": 0.3914, + "step": 397 + }, + { + "epoch": 2.1513513513513516, + "grad_norm": 3.4031248092651367, + "learning_rate": 4.453065237142592e-06, + "loss": 0.3455, + "step": 398 + }, + { + "epoch": 2.156756756756757, + "grad_norm": 3.012275457382202, + "learning_rate": 4.4504122419543745e-06, + "loss": 0.4652, + "step": 399 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 3.3084208965301514, + "learning_rate": 4.4477536222832865e-06, + "loss": 0.6343, + "step": 400 + }, + { + "epoch": 2.1675675675675676, + "grad_norm": 3.115206241607666, + "learning_rate": 4.445089385796099e-06, + "loss": 0.6975, + "step": 401 + }, + { + "epoch": 2.172972972972973, + "grad_norm": 2.893930435180664, + "learning_rate": 4.442419540175778e-06, + "loss": 0.5779, + "step": 402 + }, + { + "epoch": 2.1783783783783783, + "grad_norm": 3.0549168586730957, + "learning_rate": 4.439744093121465e-06, + "loss": 0.4541, + "step": 403 + }, + { + "epoch": 2.1837837837837837, + "grad_norm": 3.1189024448394775, + "learning_rate": 4.437063052348457e-06, + "loss": 0.4078, + "step": 404 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 6.644659042358398, + "learning_rate": 4.434376425588179e-06, + "loss": 0.6759, + "step": 405 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 2.807554006576538, + "learning_rate": 4.431684220588163e-06, + "loss": 0.2938, + "step": 406 + }, + { + "epoch": 2.2, + "grad_norm": 3.6900999546051025, + "learning_rate": 4.428986445112034e-06, + "loss": 0.676, + "step": 407 + }, + { + "epoch": 2.2054054054054055, + "grad_norm": 2.0721664428710938, + "learning_rate": 4.426283106939474e-06, + "loss": 0.1859, + "step": 408 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 2.953388214111328, + "learning_rate": 4.423574213866209e-06, + "loss": 0.2955, + "step": 409 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 3.049050807952881, + "learning_rate": 4.420859773703985e-06, + "loss": 0.2262, + "step": 410 + }, + { + "epoch": 2.2216216216216216, + "grad_norm": 3.319796323776245, + "learning_rate": 4.418139794280542e-06, + "loss": 0.2273, + "step": 411 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 2.4133522510528564, + "learning_rate": 4.415414283439595e-06, + "loss": 0.3282, + "step": 412 + }, + { + "epoch": 2.2324324324324323, + "grad_norm": 2.9842193126678467, + "learning_rate": 4.4126832490408116e-06, + "loss": 0.3651, + "step": 413 + }, + { + "epoch": 2.237837837837838, + "grad_norm": 2.759531259536743, + "learning_rate": 4.409946698959784e-06, + "loss": 0.4052, + "step": 414 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 3.045485019683838, + "learning_rate": 4.4072046410880145e-06, + "loss": 0.4638, + "step": 415 + }, + { + "epoch": 2.2486486486486488, + "grad_norm": 3.0058295726776123, + "learning_rate": 4.404457083332887e-06, + "loss": 0.517, + "step": 416 + }, + { + "epoch": 2.254054054054054, + "grad_norm": 3.025688409805298, + "learning_rate": 4.401704033617643e-06, + "loss": 0.6902, + "step": 417 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 3.3047802448272705, + "learning_rate": 4.398945499881366e-06, + "loss": 0.3552, + "step": 418 + }, + { + "epoch": 2.264864864864865, + "grad_norm": 3.0683655738830566, + "learning_rate": 4.396181490078949e-06, + "loss": 0.286, + "step": 419 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 3.627681016921997, + "learning_rate": 4.393412012181082e-06, + "loss": 0.4036, + "step": 420 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 4.552238941192627, + "learning_rate": 4.390637074174219e-06, + "loss": 0.8037, + "step": 421 + }, + { + "epoch": 2.281081081081081, + "grad_norm": 2.8688855171203613, + "learning_rate": 4.387856684060561e-06, + "loss": 0.2553, + "step": 422 + }, + { + "epoch": 2.2864864864864867, + "grad_norm": 4.21850061416626, + "learning_rate": 4.385070849858033e-06, + "loss": 0.6222, + "step": 423 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 3.038433790206909, + "learning_rate": 4.382279579600257e-06, + "loss": 0.5326, + "step": 424 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 3.297300338745117, + "learning_rate": 4.379482881336532e-06, + "loss": 0.5515, + "step": 425 + }, + { + "epoch": 2.3027027027027027, + "grad_norm": 7.162952423095703, + "learning_rate": 4.376680763131811e-06, + "loss": 0.6948, + "step": 426 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 3.2403595447540283, + "learning_rate": 4.373873233066676e-06, + "loss": 0.2947, + "step": 427 + }, + { + "epoch": 2.3135135135135134, + "grad_norm": 3.2969906330108643, + "learning_rate": 4.371060299237315e-06, + "loss": 0.2261, + "step": 428 + }, + { + "epoch": 2.3189189189189188, + "grad_norm": 2.669058322906494, + "learning_rate": 4.368241969755499e-06, + "loss": 0.5398, + "step": 429 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 2.7643518447875977, + "learning_rate": 4.36541825274856e-06, + "loss": 0.3301, + "step": 430 + }, + { + "epoch": 2.32972972972973, + "grad_norm": 3.6037657260894775, + "learning_rate": 4.3625891563593635e-06, + "loss": 0.6064, + "step": 431 + }, + { + "epoch": 2.3351351351351353, + "grad_norm": 2.8805618286132812, + "learning_rate": 4.35975468874629e-06, + "loss": 0.3897, + "step": 432 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 2.642402172088623, + "learning_rate": 4.356914858083211e-06, + "loss": 0.271, + "step": 433 + }, + { + "epoch": 2.345945945945946, + "grad_norm": 2.916337490081787, + "learning_rate": 4.354069672559458e-06, + "loss": 0.3681, + "step": 434 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 3.3312325477600098, + "learning_rate": 4.35121914037981e-06, + "loss": 0.298, + "step": 435 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 2.980583906173706, + "learning_rate": 4.348363269764462e-06, + "loss": 0.3618, + "step": 436 + }, + { + "epoch": 2.362162162162162, + "grad_norm": 3.5010197162628174, + "learning_rate": 4.345502068949003e-06, + "loss": 0.8972, + "step": 437 + }, + { + "epoch": 2.3675675675675674, + "grad_norm": 2.7187814712524414, + "learning_rate": 4.342635546184394e-06, + "loss": 0.3939, + "step": 438 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 2.8368170261383057, + "learning_rate": 4.339763709736944e-06, + "loss": 0.5462, + "step": 439 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 2.6989636421203613, + "learning_rate": 4.336886567888283e-06, + "loss": 0.5932, + "step": 440 + }, + { + "epoch": 2.383783783783784, + "grad_norm": 3.2514829635620117, + "learning_rate": 4.334004128935342e-06, + "loss": 0.4622, + "step": 441 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 5.242766857147217, + "learning_rate": 4.331116401190327e-06, + "loss": 0.5997, + "step": 442 + }, + { + "epoch": 2.3945945945945946, + "grad_norm": 3.492724657058716, + "learning_rate": 4.328223392980696e-06, + "loss": 0.3072, + "step": 443 + }, + { + "epoch": 2.4, + "grad_norm": 4.074132442474365, + "learning_rate": 4.325325112649134e-06, + "loss": 0.5338, + "step": 444 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 2.7208468914031982, + "learning_rate": 4.322421568553529e-06, + "loss": 0.3266, + "step": 445 + }, + { + "epoch": 2.410810810810811, + "grad_norm": 2.929180383682251, + "learning_rate": 4.3195127690669494e-06, + "loss": 0.4064, + "step": 446 + }, + { + "epoch": 2.4162162162162164, + "grad_norm": 2.848353624343872, + "learning_rate": 4.3165987225776186e-06, + "loss": 0.3856, + "step": 447 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 3.946488618850708, + "learning_rate": 4.313679437488889e-06, + "loss": 0.4261, + "step": 448 + }, + { + "epoch": 2.427027027027027, + "grad_norm": 5.781888961791992, + "learning_rate": 4.310754922219223e-06, + "loss": 0.4943, + "step": 449 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 2.8406941890716553, + "learning_rate": 4.307825185202164e-06, + "loss": 0.2874, + "step": 450 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 3.2017335891723633, + "learning_rate": 4.3048902348863116e-06, + "loss": 0.4218, + "step": 451 + }, + { + "epoch": 2.443243243243243, + "grad_norm": 3.8355906009674072, + "learning_rate": 4.301950079735303e-06, + "loss": 0.4204, + "step": 452 + }, + { + "epoch": 2.4486486486486485, + "grad_norm": 4.783357620239258, + "learning_rate": 4.299004728227782e-06, + "loss": 0.5593, + "step": 453 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 3.014080762863159, + "learning_rate": 4.2960541888573774e-06, + "loss": 0.4187, + "step": 454 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 3.5906598567962646, + "learning_rate": 4.29309847013268e-06, + "loss": 0.4193, + "step": 455 + }, + { + "epoch": 2.464864864864865, + "grad_norm": 3.9043331146240234, + "learning_rate": 4.290137580577216e-06, + "loss": 0.7035, + "step": 456 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 3.139753580093384, + "learning_rate": 4.287171528729423e-06, + "loss": 0.5877, + "step": 457 + }, + { + "epoch": 2.4756756756756757, + "grad_norm": 2.9091074466705322, + "learning_rate": 4.284200323142623e-06, + "loss": 0.5309, + "step": 458 + }, + { + "epoch": 2.481081081081081, + "grad_norm": 3.1253795623779297, + "learning_rate": 4.281223972385004e-06, + "loss": 0.448, + "step": 459 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 2.65510892868042, + "learning_rate": 4.27824248503959e-06, + "loss": 0.4453, + "step": 460 + }, + { + "epoch": 2.4918918918918918, + "grad_norm": 3.2135510444641113, + "learning_rate": 4.275255869704214e-06, + "loss": 0.5582, + "step": 461 + }, + { + "epoch": 2.4972972972972975, + "grad_norm": 2.452545404434204, + "learning_rate": 4.272264134991503e-06, + "loss": 0.423, + "step": 462 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 2.6370208263397217, + "learning_rate": 4.269267289528843e-06, + "loss": 0.271, + "step": 463 + }, + { + "epoch": 2.5081081081081082, + "grad_norm": 3.31266450881958, + "learning_rate": 4.266265341958356e-06, + "loss": 0.6459, + "step": 464 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 3.2743148803710938, + "learning_rate": 4.263258300936882e-06, + "loss": 0.2959, + "step": 465 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 2.883549690246582, + "learning_rate": 4.260246175135948e-06, + "loss": 0.3418, + "step": 466 + }, + { + "epoch": 2.5243243243243243, + "grad_norm": 2.7019498348236084, + "learning_rate": 4.257228973241742e-06, + "loss": 0.3459, + "step": 467 + }, + { + "epoch": 2.5297297297297296, + "grad_norm": 3.8166959285736084, + "learning_rate": 4.254206703955092e-06, + "loss": 0.4769, + "step": 468 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 3.264763593673706, + "learning_rate": 4.251179375991438e-06, + "loss": 0.6487, + "step": 469 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 2.7936933040618896, + "learning_rate": 4.248146998080808e-06, + "loss": 0.5547, + "step": 470 + }, + { + "epoch": 2.545945945945946, + "grad_norm": 3.21852707862854, + "learning_rate": 4.2451095789677945e-06, + "loss": 0.2965, + "step": 471 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 3.4528985023498535, + "learning_rate": 4.242067127411525e-06, + "loss": 0.3831, + "step": 472 + }, + { + "epoch": 2.556756756756757, + "grad_norm": 4.317023754119873, + "learning_rate": 4.239019652185642e-06, + "loss": 0.1756, + "step": 473 + }, + { + "epoch": 2.562162162162162, + "grad_norm": 3.677452325820923, + "learning_rate": 4.2359671620782725e-06, + "loss": 0.5136, + "step": 474 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 3.7563393115997314, + "learning_rate": 4.232909665892005e-06, + "loss": 0.6554, + "step": 475 + }, + { + "epoch": 2.572972972972973, + "grad_norm": 3.5125508308410645, + "learning_rate": 4.229847172443866e-06, + "loss": 0.3804, + "step": 476 + }, + { + "epoch": 2.5783783783783782, + "grad_norm": 2.8835806846618652, + "learning_rate": 4.2267796905652926e-06, + "loss": 0.3338, + "step": 477 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 3.2136261463165283, + "learning_rate": 4.223707229102105e-06, + "loss": 0.6163, + "step": 478 + }, + { + "epoch": 2.589189189189189, + "grad_norm": 3.467475175857544, + "learning_rate": 4.220629796914487e-06, + "loss": 0.3005, + "step": 479 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 3.597490072250366, + "learning_rate": 4.217547402876954e-06, + "loss": 0.56, + "step": 480 + }, + { + "epoch": 2.6, + "grad_norm": 3.2377140522003174, + "learning_rate": 4.214460055878329e-06, + "loss": 0.4512, + "step": 481 + }, + { + "epoch": 2.6054054054054054, + "grad_norm": 2.577746868133545, + "learning_rate": 4.211367764821722e-06, + "loss": 0.3074, + "step": 482 + }, + { + "epoch": 2.610810810810811, + "grad_norm": 3.6584155559539795, + "learning_rate": 4.208270538624497e-06, + "loss": 0.6752, + "step": 483 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 2.602778434753418, + "learning_rate": 4.205168386218251e-06, + "loss": 0.2347, + "step": 484 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 3.587503433227539, + "learning_rate": 4.2020613165487865e-06, + "loss": 0.5189, + "step": 485 + }, + { + "epoch": 2.627027027027027, + "grad_norm": 3.9341986179351807, + "learning_rate": 4.198949338576086e-06, + "loss": 0.7739, + "step": 486 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 2.9211957454681396, + "learning_rate": 4.1958324612742875e-06, + "loss": 0.3495, + "step": 487 + }, + { + "epoch": 2.637837837837838, + "grad_norm": 3.29193115234375, + "learning_rate": 4.1927106936316564e-06, + "loss": 0.2257, + "step": 488 + }, + { + "epoch": 2.6432432432432433, + "grad_norm": 3.3687057495117188, + "learning_rate": 4.189584044650559e-06, + "loss": 0.6708, + "step": 489 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 3.096428155899048, + "learning_rate": 4.186452523347441e-06, + "loss": 0.3126, + "step": 490 + }, + { + "epoch": 2.654054054054054, + "grad_norm": 3.0865559577941895, + "learning_rate": 4.183316138752799e-06, + "loss": 0.4219, + "step": 491 + }, + { + "epoch": 2.6594594594594594, + "grad_norm": 3.389827013015747, + "learning_rate": 4.180174899911149e-06, + "loss": 0.3937, + "step": 492 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 3.044360637664795, + "learning_rate": 4.177028815881012e-06, + "loss": 0.4098, + "step": 493 + }, + { + "epoch": 2.6702702702702705, + "grad_norm": 2.813094139099121, + "learning_rate": 4.173877895734875e-06, + "loss": 0.3597, + "step": 494 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 2.4037158489227295, + "learning_rate": 4.1707221485591764e-06, + "loss": 0.3284, + "step": 495 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 3.049436092376709, + "learning_rate": 4.167561583454272e-06, + "loss": 0.257, + "step": 496 + }, + { + "epoch": 2.6864864864864866, + "grad_norm": 3.458923816680908, + "learning_rate": 4.164396209534411e-06, + "loss": 0.1819, + "step": 497 + }, + { + "epoch": 2.691891891891892, + "grad_norm": 3.3084232807159424, + "learning_rate": 4.161226035927711e-06, + "loss": 0.7109, + "step": 498 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 3.034550189971924, + "learning_rate": 4.15805107177613e-06, + "loss": 0.6297, + "step": 499 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 3.5786449909210205, + "learning_rate": 4.15487132623544e-06, + "loss": 0.5195, + "step": 500 + }, + { + "epoch": 2.708108108108108, + "grad_norm": 3.4477646350860596, + "learning_rate": 4.151686808475204e-06, + "loss": 0.2528, + "step": 501 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 3.0256869792938232, + "learning_rate": 4.148497527678744e-06, + "loss": 0.5013, + "step": 502 + }, + { + "epoch": 2.718918918918919, + "grad_norm": 2.875121593475342, + "learning_rate": 4.145303493043118e-06, + "loss": 0.4109, + "step": 503 + }, + { + "epoch": 2.7243243243243245, + "grad_norm": 2.7204222679138184, + "learning_rate": 4.1421047137790935e-06, + "loss": 0.3197, + "step": 504 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 3.350482702255249, + "learning_rate": 4.13890119911112e-06, + "loss": 0.6369, + "step": 505 + }, + { + "epoch": 2.735135135135135, + "grad_norm": 3.096774101257324, + "learning_rate": 4.135692958277303e-06, + "loss": 0.4581, + "step": 506 + }, + { + "epoch": 2.7405405405405405, + "grad_norm": 2.8896536827087402, + "learning_rate": 4.132480000529375e-06, + "loss": 0.6217, + "step": 507 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 2.643932580947876, + "learning_rate": 4.129262335132676e-06, + "loss": 0.4951, + "step": 508 + }, + { + "epoch": 2.7513513513513512, + "grad_norm": 2.6077864170074463, + "learning_rate": 4.126039971366114e-06, + "loss": 0.2185, + "step": 509 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 2.531507968902588, + "learning_rate": 4.122812918522154e-06, + "loss": 0.5428, + "step": 510 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 4.125836372375488, + "learning_rate": 4.119581185906776e-06, + "loss": 0.5466, + "step": 511 + }, + { + "epoch": 2.7675675675675677, + "grad_norm": 2.9921016693115234, + "learning_rate": 4.1163447828394595e-06, + "loss": 0.3803, + "step": 512 + }, + { + "epoch": 2.772972972972973, + "grad_norm": 2.9517931938171387, + "learning_rate": 4.113103718653152e-06, + "loss": 0.2722, + "step": 513 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 2.8333382606506348, + "learning_rate": 4.10985800269424e-06, + "loss": 0.333, + "step": 514 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 2.94168758392334, + "learning_rate": 4.106607644322529e-06, + "loss": 0.2186, + "step": 515 + }, + { + "epoch": 2.789189189189189, + "grad_norm": 3.2743892669677734, + "learning_rate": 4.103352652911207e-06, + "loss": 0.6365, + "step": 516 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 4.692770004272461, + "learning_rate": 4.100093037846825e-06, + "loss": 0.7261, + "step": 517 + }, + { + "epoch": 2.8, + "grad_norm": 3.2157247066497803, + "learning_rate": 4.0968288085292675e-06, + "loss": 0.2767, + "step": 518 + }, + { + "epoch": 2.8054054054054056, + "grad_norm": 3.196887731552124, + "learning_rate": 4.093559974371725e-06, + "loss": 0.4743, + "step": 519 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 2.406752586364746, + "learning_rate": 4.090286544800667e-06, + "loss": 0.3789, + "step": 520 + }, + { + "epoch": 2.8162162162162163, + "grad_norm": 3.1769447326660156, + "learning_rate": 4.087008529255815e-06, + "loss": 0.6252, + "step": 521 + }, + { + "epoch": 2.8216216216216217, + "grad_norm": 3.068370819091797, + "learning_rate": 4.083725937190115e-06, + "loss": 0.3467, + "step": 522 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 3.2665855884552, + "learning_rate": 4.0804387780697114e-06, + "loss": 0.3857, + "step": 523 + }, + { + "epoch": 2.8324324324324324, + "grad_norm": 3.368759870529175, + "learning_rate": 4.077147061373918e-06, + "loss": 0.4679, + "step": 524 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 3.989163875579834, + "learning_rate": 4.073850796595192e-06, + "loss": 0.2439, + "step": 525 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 3.6244685649871826, + "learning_rate": 4.070549993239106e-06, + "loss": 0.435, + "step": 526 + }, + { + "epoch": 2.8486486486486484, + "grad_norm": 3.585151195526123, + "learning_rate": 4.06724466082432e-06, + "loss": 0.5022, + "step": 527 + }, + { + "epoch": 2.854054054054054, + "grad_norm": 3.2420976161956787, + "learning_rate": 4.063934808882555e-06, + "loss": 0.4282, + "step": 528 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 3.1674294471740723, + "learning_rate": 4.0606204469585656e-06, + "loss": 0.3436, + "step": 529 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 2.6856706142425537, + "learning_rate": 4.057301584610112e-06, + "loss": 0.3889, + "step": 530 + }, + { + "epoch": 2.8702702702702703, + "grad_norm": 3.0438942909240723, + "learning_rate": 4.053978231407931e-06, + "loss": 0.4828, + "step": 531 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 3.3561246395111084, + "learning_rate": 4.0506503969357115e-06, + "loss": 0.5814, + "step": 532 + }, + { + "epoch": 2.881081081081081, + "grad_norm": 2.5318350791931152, + "learning_rate": 4.047318090790065e-06, + "loss": 0.4768, + "step": 533 + }, + { + "epoch": 2.8864864864864863, + "grad_norm": 2.587224006652832, + "learning_rate": 4.043981322580498e-06, + "loss": 0.4262, + "step": 534 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 2.73926043510437, + "learning_rate": 4.040640101929384e-06, + "loss": 0.421, + "step": 535 + }, + { + "epoch": 2.8972972972972975, + "grad_norm": 3.53908371925354, + "learning_rate": 4.037294438471936e-06, + "loss": 0.4019, + "step": 536 + }, + { + "epoch": 2.902702702702703, + "grad_norm": 3.0980448722839355, + "learning_rate": 4.033944341856181e-06, + "loss": 0.4322, + "step": 537 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 2.9265666007995605, + "learning_rate": 4.030589821742926e-06, + "loss": 0.3841, + "step": 538 + }, + { + "epoch": 2.9135135135135135, + "grad_norm": 3.4082043170928955, + "learning_rate": 4.0272308878057385e-06, + "loss": 0.7083, + "step": 539 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 3.297515630722046, + "learning_rate": 4.023867549730912e-06, + "loss": 0.5688, + "step": 540 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 3.0538225173950195, + "learning_rate": 4.020499817217441e-06, + "loss": 0.5979, + "step": 541 + }, + { + "epoch": 2.92972972972973, + "grad_norm": 3.1792757511138916, + "learning_rate": 4.017127699976992e-06, + "loss": 0.5034, + "step": 542 + }, + { + "epoch": 2.935135135135135, + "grad_norm": 3.1574482917785645, + "learning_rate": 4.013751207733877e-06, + "loss": 0.6656, + "step": 543 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 2.523123264312744, + "learning_rate": 4.010370350225023e-06, + "loss": 0.2789, + "step": 544 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 3.1950793266296387, + "learning_rate": 4.006985137199945e-06, + "loss": 0.2163, + "step": 545 + }, + { + "epoch": 2.9513513513513514, + "grad_norm": 3.2089648246765137, + "learning_rate": 4.00359557842072e-06, + "loss": 0.4179, + "step": 546 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 3.852578639984131, + "learning_rate": 4.000201683661958e-06, + "loss": 0.4683, + "step": 547 + }, + { + "epoch": 2.962162162162162, + "grad_norm": 2.7612597942352295, + "learning_rate": 3.996803462710766e-06, + "loss": 0.3506, + "step": 548 + }, + { + "epoch": 2.9675675675675675, + "grad_norm": 4.811823844909668, + "learning_rate": 3.993400925366736e-06, + "loss": 0.6582, + "step": 549 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 3.0135858058929443, + "learning_rate": 3.989994081441902e-06, + "loss": 0.504, + "step": 550 + }, + { + "epoch": 2.9783783783783786, + "grad_norm": 2.710277795791626, + "learning_rate": 3.986582940760717e-06, + "loss": 0.7362, + "step": 551 + }, + { + "epoch": 2.983783783783784, + "grad_norm": 3.175443649291992, + "learning_rate": 3.983167513160025e-06, + "loss": 0.4116, + "step": 552 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 3.101109743118286, + "learning_rate": 3.979747808489036e-06, + "loss": 0.2188, + "step": 553 + }, + { + "epoch": 2.9945945945945946, + "grad_norm": 3.2320079803466797, + "learning_rate": 3.976323836609289e-06, + "loss": 0.7558, + "step": 554 + }, + { + "epoch": 3.0, + "grad_norm": 3.6071934700012207, + "learning_rate": 3.9728956073946305e-06, + "loss": 0.6491, + "step": 555 + }, + { + "epoch": 3.0054054054054054, + "grad_norm": 3.1119353771209717, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1625, + "step": 556 + }, + { + "epoch": 3.0108108108108107, + "grad_norm": 3.0440328121185303, + "learning_rate": 3.966026416517321e-06, + "loss": 0.311, + "step": 557 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 4.069122791290283, + "learning_rate": 3.962585474663636e-06, + "loss": 0.5299, + "step": 558 + }, + { + "epoch": 3.0216216216216214, + "grad_norm": 2.878645896911621, + "learning_rate": 3.959140315092911e-06, + "loss": 0.2718, + "step": 559 + }, + { + "epoch": 3.027027027027027, + "grad_norm": 3.526695966720581, + "learning_rate": 3.955690947740092e-06, + "loss": 0.2954, + "step": 560 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 3.25087308883667, + "learning_rate": 3.95223738255226e-06, + "loss": 0.2388, + "step": 561 + }, + { + "epoch": 3.037837837837838, + "grad_norm": 3.5467700958251953, + "learning_rate": 3.9487796294886015e-06, + "loss": 0.2014, + "step": 562 + }, + { + "epoch": 3.0432432432432432, + "grad_norm": 4.397517681121826, + "learning_rate": 3.945317698520379e-06, + "loss": 0.2102, + "step": 563 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 3.7297182083129883, + "learning_rate": 3.941851599630903e-06, + "loss": 0.499, + "step": 564 + }, + { + "epoch": 3.054054054054054, + "grad_norm": 4.417158603668213, + "learning_rate": 3.938381342815503e-06, + "loss": 0.3392, + "step": 565 + }, + { + "epoch": 3.0594594594594593, + "grad_norm": 4.6037421226501465, + "learning_rate": 3.934906938081499e-06, + "loss": 0.1942, + "step": 566 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 3.5600531101226807, + "learning_rate": 3.931428395448174e-06, + "loss": 0.1753, + "step": 567 + }, + { + "epoch": 3.0702702702702704, + "grad_norm": 2.868013381958008, + "learning_rate": 3.927945724946743e-06, + "loss": 0.2959, + "step": 568 + }, + { + "epoch": 3.075675675675676, + "grad_norm": 3.5543227195739746, + "learning_rate": 3.924458936620322e-06, + "loss": 0.4625, + "step": 569 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 8.972922325134277, + "learning_rate": 3.920968040523904e-06, + "loss": 0.2571, + "step": 570 + }, + { + "epoch": 3.0864864864864865, + "grad_norm": 3.037388324737549, + "learning_rate": 3.917473046724329e-06, + "loss": 0.1438, + "step": 571 + }, + { + "epoch": 3.091891891891892, + "grad_norm": 3.3261702060699463, + "learning_rate": 3.9139739653002525e-06, + "loss": 0.3572, + "step": 572 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 2.425293207168579, + "learning_rate": 3.910470806342117e-06, + "loss": 0.165, + "step": 573 + }, + { + "epoch": 3.1027027027027025, + "grad_norm": 3.5718603134155273, + "learning_rate": 3.9069635799521245e-06, + "loss": 0.3209, + "step": 574 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 3.8211171627044678, + "learning_rate": 3.903452296244204e-06, + "loss": 0.1976, + "step": 575 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 5.944535255432129, + "learning_rate": 3.899936965343989e-06, + "loss": 0.6074, + "step": 576 + }, + { + "epoch": 3.118918918918919, + "grad_norm": 6.603860378265381, + "learning_rate": 3.89641759738878e-06, + "loss": 0.4051, + "step": 577 + }, + { + "epoch": 3.1243243243243244, + "grad_norm": 6.712981700897217, + "learning_rate": 3.892894202527523e-06, + "loss": 0.3787, + "step": 578 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 3.267186403274536, + "learning_rate": 3.8893667909207735e-06, + "loss": 0.0927, + "step": 579 + }, + { + "epoch": 3.135135135135135, + "grad_norm": 4.476837158203125, + "learning_rate": 3.88583537274067e-06, + "loss": 0.4706, + "step": 580 + }, + { + "epoch": 3.1405405405405404, + "grad_norm": 4.272335052490234, + "learning_rate": 3.8822999581709085e-06, + "loss": 0.3949, + "step": 581 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 3.6685309410095215, + "learning_rate": 3.878760557406708e-06, + "loss": 0.1971, + "step": 582 + }, + { + "epoch": 3.1513513513513516, + "grad_norm": 3.9899449348449707, + "learning_rate": 3.875217180654779e-06, + "loss": 0.5156, + "step": 583 + }, + { + "epoch": 3.156756756756757, + "grad_norm": 3.866804361343384, + "learning_rate": 3.871669838133303e-06, + "loss": 0.3552, + "step": 584 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 3.565648317337036, + "learning_rate": 3.868118540071894e-06, + "loss": 0.4369, + "step": 585 + }, + { + "epoch": 3.1675675675675676, + "grad_norm": 3.5073986053466797, + "learning_rate": 3.8645632967115755e-06, + "loss": 0.3694, + "step": 586 + }, + { + "epoch": 3.172972972972973, + "grad_norm": 3.7636868953704834, + "learning_rate": 3.861004118304746e-06, + "loss": 0.3404, + "step": 587 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 2.940094232559204, + "learning_rate": 3.857441015115154e-06, + "loss": 0.3086, + "step": 588 + }, + { + "epoch": 3.1837837837837837, + "grad_norm": 3.727414608001709, + "learning_rate": 3.8538739974178635e-06, + "loss": 0.253, + "step": 589 + }, + { + "epoch": 3.189189189189189, + "grad_norm": 3.5140156745910645, + "learning_rate": 3.850303075499227e-06, + "loss": 0.2436, + "step": 590 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 3.545952558517456, + "learning_rate": 3.84672825965686e-06, + "loss": 0.328, + "step": 591 + }, + { + "epoch": 3.2, + "grad_norm": 3.534240484237671, + "learning_rate": 3.843149560199601e-06, + "loss": 0.2687, + "step": 592 + }, + { + "epoch": 3.2054054054054055, + "grad_norm": 2.8464927673339844, + "learning_rate": 3.839566987447492e-06, + "loss": 0.1417, + "step": 593 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 4.138559818267822, + "learning_rate": 3.835980551731743e-06, + "loss": 0.2106, + "step": 594 + }, + { + "epoch": 3.2162162162162162, + "grad_norm": 2.917670249938965, + "learning_rate": 3.8323902633947045e-06, + "loss": 0.3154, + "step": 595 + }, + { + "epoch": 3.2216216216216216, + "grad_norm": 3.029660224914551, + "learning_rate": 3.828796132789835e-06, + "loss": 0.1218, + "step": 596 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 3.2845771312713623, + "learning_rate": 3.825198170281677e-06, + "loss": 0.1336, + "step": 597 + }, + { + "epoch": 3.2324324324324323, + "grad_norm": 3.1375670433044434, + "learning_rate": 3.821596386245819e-06, + "loss": 0.2518, + "step": 598 + }, + { + "epoch": 3.237837837837838, + "grad_norm": 3.0021941661834717, + "learning_rate": 3.817990791068874e-06, + "loss": 0.2762, + "step": 599 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 4.141000747680664, + "learning_rate": 3.81438139514844e-06, + "loss": 0.2722, + "step": 600 + }, + { + "epoch": 3.2486486486486488, + "grad_norm": 3.9065279960632324, + "learning_rate": 3.8107682088930797e-06, + "loss": 0.3542, + "step": 601 + }, + { + "epoch": 3.254054054054054, + "grad_norm": 3.718417167663574, + "learning_rate": 3.807151242722286e-06, + "loss": 0.344, + "step": 602 + }, + { + "epoch": 3.2594594594594595, + "grad_norm": 4.013717174530029, + "learning_rate": 3.8035305070664484e-06, + "loss": 0.1625, + "step": 603 + }, + { + "epoch": 3.264864864864865, + "grad_norm": 3.348888397216797, + "learning_rate": 3.7999060123668318e-06, + "loss": 0.2925, + "step": 604 + }, + { + "epoch": 3.27027027027027, + "grad_norm": 3.496079206466675, + "learning_rate": 3.7962777690755364e-06, + "loss": 0.1523, + "step": 605 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 3.07607102394104, + "learning_rate": 3.792645787655476e-06, + "loss": 0.1674, + "step": 606 + }, + { + "epoch": 3.281081081081081, + "grad_norm": 3.4036154747009277, + "learning_rate": 3.7890100785803425e-06, + "loss": 0.2856, + "step": 607 + }, + { + "epoch": 3.2864864864864867, + "grad_norm": 6.092559337615967, + "learning_rate": 3.785370652334577e-06, + "loss": 0.1094, + "step": 608 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 3.9322001934051514, + "learning_rate": 3.7817275194133403e-06, + "loss": 0.2611, + "step": 609 + }, + { + "epoch": 3.2972972972972974, + "grad_norm": 3.189563274383545, + "learning_rate": 3.778080690322483e-06, + "loss": 0.1315, + "step": 610 + }, + { + "epoch": 3.3027027027027027, + "grad_norm": 4.304934024810791, + "learning_rate": 3.774430175578514e-06, + "loss": 0.1686, + "step": 611 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 2.9030067920684814, + "learning_rate": 3.7707759857085706e-06, + "loss": 0.4642, + "step": 612 + }, + { + "epoch": 3.3135135135135134, + "grad_norm": 3.7485930919647217, + "learning_rate": 3.7671181312503886e-06, + "loss": 0.1987, + "step": 613 + }, + { + "epoch": 3.3189189189189188, + "grad_norm": 3.4700896739959717, + "learning_rate": 3.763456622752271e-06, + "loss": 0.3307, + "step": 614 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 3.0079376697540283, + "learning_rate": 3.7597914707730583e-06, + "loss": 0.1731, + "step": 615 + }, + { + "epoch": 3.32972972972973, + "grad_norm": 3.155235767364502, + "learning_rate": 3.7561226858820984e-06, + "loss": 0.2003, + "step": 616 + }, + { + "epoch": 3.3351351351351353, + "grad_norm": 3.847895622253418, + "learning_rate": 3.7524502786592143e-06, + "loss": 0.4014, + "step": 617 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 2.7505502700805664, + "learning_rate": 3.7487742596946753e-06, + "loss": 0.205, + "step": 618 + }, + { + "epoch": 3.345945945945946, + "grad_norm": 3.654529571533203, + "learning_rate": 3.7450946395891674e-06, + "loss": 0.2932, + "step": 619 + }, + { + "epoch": 3.3513513513513513, + "grad_norm": 2.9763967990875244, + "learning_rate": 3.7414114289537593e-06, + "loss": 0.2748, + "step": 620 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 3.889683961868286, + "learning_rate": 3.7377246384098763e-06, + "loss": 0.3665, + "step": 621 + }, + { + "epoch": 3.362162162162162, + "grad_norm": 4.193166732788086, + "learning_rate": 3.7340342785892645e-06, + "loss": 0.3453, + "step": 622 + }, + { + "epoch": 3.3675675675675674, + "grad_norm": 3.4371488094329834, + "learning_rate": 3.7303403601339646e-06, + "loss": 0.473, + "step": 623 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 3.6939027309417725, + "learning_rate": 3.726642893696279e-06, + "loss": 0.3017, + "step": 624 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 4.904304504394531, + "learning_rate": 3.7229418899387414e-06, + "loss": 0.4841, + "step": 625 + }, + { + "epoch": 3.383783783783784, + "grad_norm": 3.6373438835144043, + "learning_rate": 3.719237359534087e-06, + "loss": 0.3879, + "step": 626 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 3.403676986694336, + "learning_rate": 3.71552931316522e-06, + "loss": 0.3876, + "step": 627 + }, + { + "epoch": 3.3945945945945946, + "grad_norm": 3.2292237281799316, + "learning_rate": 3.7118177615251834e-06, + "loss": 0.4491, + "step": 628 + }, + { + "epoch": 3.4, + "grad_norm": 3.317850351333618, + "learning_rate": 3.70810271531713e-06, + "loss": 0.3763, + "step": 629 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 3.664735794067383, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4171, + "step": 630 + }, + { + "epoch": 3.410810810810811, + "grad_norm": 3.781569242477417, + "learning_rate": 3.700662182059936e-06, + "loss": 0.2445, + "step": 631 + }, + { + "epoch": 3.4162162162162164, + "grad_norm": 2.878260850906372, + "learning_rate": 3.696936716467363e-06, + "loss": 0.1347, + "step": 632 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 2.8670761585235596, + "learning_rate": 3.693207799219846e-06, + "loss": 0.2822, + "step": 633 + }, + { + "epoch": 3.427027027027027, + "grad_norm": 3.9338245391845703, + "learning_rate": 3.689475441070615e-06, + "loss": 0.3425, + "step": 634 + }, + { + "epoch": 3.4324324324324325, + "grad_norm": 3.3172149658203125, + "learning_rate": 3.685739652782822e-06, + "loss": 0.3315, + "step": 635 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 3.9986648559570312, + "learning_rate": 3.682000445129512e-06, + "loss": 0.1841, + "step": 636 + }, + { + "epoch": 3.443243243243243, + "grad_norm": 3.4503986835479736, + "learning_rate": 3.6782578288935896e-06, + "loss": 0.3151, + "step": 637 + }, + { + "epoch": 3.4486486486486485, + "grad_norm": 3.8826167583465576, + "learning_rate": 3.6745118148677882e-06, + "loss": 0.1272, + "step": 638 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 3.0585904121398926, + "learning_rate": 3.6707624138546414e-06, + "loss": 0.2436, + "step": 639 + }, + { + "epoch": 3.4594594594594597, + "grad_norm": 3.8409557342529297, + "learning_rate": 3.6670096366664477e-06, + "loss": 0.6321, + "step": 640 + }, + { + "epoch": 3.464864864864865, + "grad_norm": 3.7260093688964844, + "learning_rate": 3.663253494125244e-06, + "loss": 0.1262, + "step": 641 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 3.195587396621704, + "learning_rate": 3.6594939970627706e-06, + "loss": 0.2669, + "step": 642 + }, + { + "epoch": 3.4756756756756757, + "grad_norm": 2.565070629119873, + "learning_rate": 3.655731156320441e-06, + "loss": 0.1228, + "step": 643 + }, + { + "epoch": 3.481081081081081, + "grad_norm": 3.745422124862671, + "learning_rate": 3.651964982749312e-06, + "loss": 0.1759, + "step": 644 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 4.96168327331543, + "learning_rate": 3.648195487210051e-06, + "loss": 0.5677, + "step": 645 + }, + { + "epoch": 3.4918918918918918, + "grad_norm": 3.514446496963501, + "learning_rate": 3.644422680572906e-06, + "loss": 0.1874, + "step": 646 + }, + { + "epoch": 3.4972972972972975, + "grad_norm": 3.1427719593048096, + "learning_rate": 3.640646573717671e-06, + "loss": 0.3225, + "step": 647 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 3.32208514213562, + "learning_rate": 3.63686717753366e-06, + "loss": 0.102, + "step": 648 + }, + { + "epoch": 3.5081081081081082, + "grad_norm": 3.409299373626709, + "learning_rate": 3.6330845029196697e-06, + "loss": 0.1585, + "step": 649 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 2.827052116394043, + "learning_rate": 3.629298560783952e-06, + "loss": 0.3046, + "step": 650 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 3.541518211364746, + "learning_rate": 3.6255093620441835e-06, + "loss": 0.2037, + "step": 651 + }, + { + "epoch": 3.5243243243243243, + "grad_norm": 3.067040205001831, + "learning_rate": 3.6217169176274293e-06, + "loss": 0.1784, + "step": 652 + }, + { + "epoch": 3.5297297297297296, + "grad_norm": 4.001040935516357, + "learning_rate": 3.6179212384701146e-06, + "loss": 0.1974, + "step": 653 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 4.03037691116333, + "learning_rate": 3.6141223355179946e-06, + "loss": 0.2161, + "step": 654 + }, + { + "epoch": 3.5405405405405403, + "grad_norm": 3.303591728210449, + "learning_rate": 3.610320219726118e-06, + "loss": 0.1487, + "step": 655 + }, + { + "epoch": 3.545945945945946, + "grad_norm": 4.183008193969727, + "learning_rate": 3.606514902058802e-06, + "loss": 0.2231, + "step": 656 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 4.2100300788879395, + "learning_rate": 3.602706393489594e-06, + "loss": 0.5068, + "step": 657 + }, + { + "epoch": 3.556756756756757, + "grad_norm": 4.521003246307373, + "learning_rate": 3.598894705001246e-06, + "loss": 0.4621, + "step": 658 + }, + { + "epoch": 3.562162162162162, + "grad_norm": 3.452348470687866, + "learning_rate": 3.5950798475856783e-06, + "loss": 0.285, + "step": 659 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 3.468987464904785, + "learning_rate": 3.5912618322439487e-06, + "loss": 0.4277, + "step": 660 + }, + { + "epoch": 3.572972972972973, + "grad_norm": 3.431551933288574, + "learning_rate": 3.587440669986224e-06, + "loss": 0.1993, + "step": 661 + }, + { + "epoch": 3.5783783783783782, + "grad_norm": 3.017648220062256, + "learning_rate": 3.5836163718317453e-06, + "loss": 0.272, + "step": 662 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 3.837244987487793, + "learning_rate": 3.5797889488087946e-06, + "loss": 0.6019, + "step": 663 + }, + { + "epoch": 3.589189189189189, + "grad_norm": 3.221762180328369, + "learning_rate": 3.575958411954668e-06, + "loss": 0.3603, + "step": 664 + }, + { + "epoch": 3.5945945945945947, + "grad_norm": 4.279484272003174, + "learning_rate": 3.5721247723156393e-06, + "loss": 0.4656, + "step": 665 + }, + { + "epoch": 3.6, + "grad_norm": 3.723459243774414, + "learning_rate": 3.5682880409469316e-06, + "loss": 0.2466, + "step": 666 + }, + { + "epoch": 3.6054054054054054, + "grad_norm": 2.7260632514953613, + "learning_rate": 3.564448228912682e-06, + "loss": 0.1848, + "step": 667 + }, + { + "epoch": 3.610810810810811, + "grad_norm": 3.6656649112701416, + "learning_rate": 3.5606053472859124e-06, + "loss": 0.4968, + "step": 668 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 4.570294380187988, + "learning_rate": 3.556759407148496e-06, + "loss": 0.316, + "step": 669 + }, + { + "epoch": 3.6216216216216215, + "grad_norm": 3.174433946609497, + "learning_rate": 3.5529104195911258e-06, + "loss": 0.2232, + "step": 670 + }, + { + "epoch": 3.627027027027027, + "grad_norm": 4.481954574584961, + "learning_rate": 3.549058395713285e-06, + "loss": 0.4435, + "step": 671 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 3.8758301734924316, + "learning_rate": 3.54520334662321e-06, + "loss": 0.1455, + "step": 672 + }, + { + "epoch": 3.637837837837838, + "grad_norm": 3.1699628829956055, + "learning_rate": 3.5413452834378626e-06, + "loss": 0.3037, + "step": 673 + }, + { + "epoch": 3.6432432432432433, + "grad_norm": 3.8971962928771973, + "learning_rate": 3.5374842172828953e-06, + "loss": 0.4309, + "step": 674 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 3.3087549209594727, + "learning_rate": 3.533620159292621e-06, + "loss": 0.383, + "step": 675 + }, + { + "epoch": 3.654054054054054, + "grad_norm": 2.9413082599639893, + "learning_rate": 3.529753120609982e-06, + "loss": 0.1963, + "step": 676 + }, + { + "epoch": 3.6594594594594594, + "grad_norm": 3.309837818145752, + "learning_rate": 3.5258831123865136e-06, + "loss": 0.1922, + "step": 677 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 4.124879360198975, + "learning_rate": 3.5220101457823147e-06, + "loss": 0.5589, + "step": 678 + }, + { + "epoch": 3.6702702702702705, + "grad_norm": 3.2587103843688965, + "learning_rate": 3.5181342319660174e-06, + "loss": 0.1757, + "step": 679 + }, + { + "epoch": 3.6756756756756754, + "grad_norm": 4.179666042327881, + "learning_rate": 3.5142553821147498e-06, + "loss": 0.1208, + "step": 680 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 3.4041192531585693, + "learning_rate": 3.5103736074141106e-06, + "loss": 0.2416, + "step": 681 + }, + { + "epoch": 3.6864864864864866, + "grad_norm": 4.982706546783447, + "learning_rate": 3.5064889190581293e-06, + "loss": 0.3841, + "step": 682 + }, + { + "epoch": 3.691891891891892, + "grad_norm": 3.5895309448242188, + "learning_rate": 3.5026013282492406e-06, + "loss": 0.3723, + "step": 683 + }, + { + "epoch": 3.6972972972972973, + "grad_norm": 3.4824306964874268, + "learning_rate": 3.498710846198247e-06, + "loss": 0.4403, + "step": 684 + }, + { + "epoch": 3.7027027027027026, + "grad_norm": 3.501023054122925, + "learning_rate": 3.494817484124289e-06, + "loss": 0.2813, + "step": 685 + }, + { + "epoch": 3.708108108108108, + "grad_norm": 3.934908151626587, + "learning_rate": 3.490921253254813e-06, + "loss": 0.4287, + "step": 686 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 3.24141526222229, + "learning_rate": 3.487022164825539e-06, + "loss": 0.234, + "step": 687 + }, + { + "epoch": 3.718918918918919, + "grad_norm": 3.3419880867004395, + "learning_rate": 3.4831202300804246e-06, + "loss": 0.2135, + "step": 688 + }, + { + "epoch": 3.7243243243243245, + "grad_norm": 3.923778772354126, + "learning_rate": 3.479215460271638e-06, + "loss": 0.2725, + "step": 689 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 3.2432096004486084, + "learning_rate": 3.475307866659522e-06, + "loss": 0.228, + "step": 690 + }, + { + "epoch": 3.735135135135135, + "grad_norm": 3.0307705402374268, + "learning_rate": 3.4713974605125634e-06, + "loss": 0.0985, + "step": 691 + }, + { + "epoch": 3.7405405405405405, + "grad_norm": 2.778942346572876, + "learning_rate": 3.4674842531073587e-06, + "loss": 0.2137, + "step": 692 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 3.711315155029297, + "learning_rate": 3.4635682557285833e-06, + "loss": 0.1707, + "step": 693 + }, + { + "epoch": 3.7513513513513512, + "grad_norm": 3.165668487548828, + "learning_rate": 3.459649479668956e-06, + "loss": 0.3021, + "step": 694 + }, + { + "epoch": 3.756756756756757, + "grad_norm": 3.7491254806518555, + "learning_rate": 3.4557279362292117e-06, + "loss": 0.3457, + "step": 695 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 3.271603584289551, + "learning_rate": 3.451803636718064e-06, + "loss": 0.1193, + "step": 696 + }, + { + "epoch": 3.7675675675675677, + "grad_norm": 3.872382402420044, + "learning_rate": 3.447876592452174e-06, + "loss": 0.2261, + "step": 697 + }, + { + "epoch": 3.772972972972973, + "grad_norm": 4.634008407592773, + "learning_rate": 3.4439468147561196e-06, + "loss": 0.5042, + "step": 698 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 3.6930148601531982, + "learning_rate": 3.440014314962358e-06, + "loss": 0.3481, + "step": 699 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 4.709466457366943, + "learning_rate": 3.4360791044112e-06, + "loss": 0.2317, + "step": 700 + }, + { + "epoch": 3.789189189189189, + "grad_norm": 4.37923002243042, + "learning_rate": 3.432141194450772e-06, + "loss": 0.395, + "step": 701 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 3.1600489616394043, + "learning_rate": 3.4282005964369836e-06, + "loss": 0.1767, + "step": 702 + }, + { + "epoch": 3.8, + "grad_norm": 3.9799487590789795, + "learning_rate": 3.424257321733497e-06, + "loss": 0.2146, + "step": 703 + }, + { + "epoch": 3.8054054054054056, + "grad_norm": 2.79176664352417, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.1534, + "step": 704 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 3.0024254322052, + "learning_rate": 3.4163627877506434e-06, + "loss": 0.2513, + "step": 705 + }, + { + "epoch": 3.8162162162162163, + "grad_norm": 2.924475908279419, + "learning_rate": 3.4124115512370636e-06, + "loss": 0.4154, + "step": 706 + }, + { + "epoch": 3.8216216216216217, + "grad_norm": 3.2713992595672607, + "learning_rate": 3.408457683565295e-06, + "loss": 0.1822, + "step": 707 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 3.094003438949585, + "learning_rate": 3.4045011961372675e-06, + "loss": 0.3589, + "step": 708 + }, + { + "epoch": 3.8324324324324324, + "grad_norm": 3.423858404159546, + "learning_rate": 3.4005421003624637e-06, + "loss": 0.4615, + "step": 709 + }, + { + "epoch": 3.8378378378378377, + "grad_norm": 2.038792848587036, + "learning_rate": 3.3965804076578896e-06, + "loss": 0.1001, + "step": 710 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 2.6447055339813232, + "learning_rate": 3.392616129448039e-06, + "loss": 0.2788, + "step": 711 + }, + { + "epoch": 3.8486486486486484, + "grad_norm": 3.546876907348633, + "learning_rate": 3.3886492771648593e-06, + "loss": 0.2663, + "step": 712 + }, + { + "epoch": 3.854054054054054, + "grad_norm": 2.9587066173553467, + "learning_rate": 3.384679862247726e-06, + "loss": 0.3497, + "step": 713 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 3.7122113704681396, + "learning_rate": 3.3807078961434013e-06, + "loss": 0.3613, + "step": 714 + }, + { + "epoch": 3.864864864864865, + "grad_norm": 3.157294988632202, + "learning_rate": 3.376733390306004e-06, + "loss": 0.0783, + "step": 715 + }, + { + "epoch": 3.8702702702702703, + "grad_norm": 3.564279317855835, + "learning_rate": 3.372756356196979e-06, + "loss": 0.1617, + "step": 716 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 4.231864929199219, + "learning_rate": 3.3687768052850595e-06, + "loss": 0.6444, + "step": 717 + }, + { + "epoch": 3.881081081081081, + "grad_norm": 5.480365753173828, + "learning_rate": 3.364794749046239e-06, + "loss": 0.4858, + "step": 718 + }, + { + "epoch": 3.8864864864864863, + "grad_norm": 3.428140878677368, + "learning_rate": 3.3608101989637333e-06, + "loss": 0.3103, + "step": 719 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 3.521989345550537, + "learning_rate": 3.356823166527952e-06, + "loss": 0.2501, + "step": 720 + }, + { + "epoch": 3.8972972972972975, + "grad_norm": 3.287081718444824, + "learning_rate": 3.352833663236463e-06, + "loss": 0.18, + "step": 721 + }, + { + "epoch": 3.902702702702703, + "grad_norm": 3.323146104812622, + "learning_rate": 3.348841700593956e-06, + "loss": 0.12, + "step": 722 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 3.516693115234375, + "learning_rate": 3.3448472901122187e-06, + "loss": 0.2618, + "step": 723 + }, + { + "epoch": 3.9135135135135135, + "grad_norm": 3.8109545707702637, + "learning_rate": 3.340850443310092e-06, + "loss": 0.3689, + "step": 724 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 3.8335933685302734, + "learning_rate": 3.336851171713447e-06, + "loss": 0.2195, + "step": 725 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 3.9054670333862305, + "learning_rate": 3.3328494868551444e-06, + "loss": 0.2602, + "step": 726 + }, + { + "epoch": 3.92972972972973, + "grad_norm": 3.1380631923675537, + "learning_rate": 3.3288454002750046e-06, + "loss": 0.1561, + "step": 727 + }, + { + "epoch": 3.935135135135135, + "grad_norm": 4.304198741912842, + "learning_rate": 3.3248389235197764e-06, + "loss": 0.4469, + "step": 728 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 3.3321573734283447, + "learning_rate": 3.3208300681430967e-06, + "loss": 0.2246, + "step": 729 + }, + { + "epoch": 3.945945945945946, + "grad_norm": 3.89400315284729, + "learning_rate": 3.3168188457054656e-06, + "loss": 0.2743, + "step": 730 + }, + { + "epoch": 3.9513513513513514, + "grad_norm": 3.393209934234619, + "learning_rate": 3.312805267774209e-06, + "loss": 0.551, + "step": 731 + }, + { + "epoch": 3.9567567567567568, + "grad_norm": 3.711652994155884, + "learning_rate": 3.3087893459234423e-06, + "loss": 0.3522, + "step": 732 + }, + { + "epoch": 3.962162162162162, + "grad_norm": 3.6701200008392334, + "learning_rate": 3.304771091734043e-06, + "loss": 0.3084, + "step": 733 + }, + { + "epoch": 3.9675675675675675, + "grad_norm": 3.1742889881134033, + "learning_rate": 3.300750516793614e-06, + "loss": 0.3406, + "step": 734 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 4.000397682189941, + "learning_rate": 3.2967276326964504e-06, + "loss": 0.3463, + "step": 735 + }, + { + "epoch": 3.9783783783783786, + "grad_norm": 3.7932708263397217, + "learning_rate": 3.2927024510435057e-06, + "loss": 0.3758, + "step": 736 + }, + { + "epoch": 3.983783783783784, + "grad_norm": 3.6258292198181152, + "learning_rate": 3.2886749834423587e-06, + "loss": 0.3328, + "step": 737 + }, + { + "epoch": 3.9891891891891893, + "grad_norm": 4.628194332122803, + "learning_rate": 3.284645241507183e-06, + "loss": 0.6213, + "step": 738 + }, + { + "epoch": 3.9945945945945946, + "grad_norm": 4.173697471618652, + "learning_rate": 3.280613236858707e-06, + "loss": 0.2463, + "step": 739 + }, + { + "epoch": 4.0, + "grad_norm": 2.9315719604492188, + "learning_rate": 3.2765789811241865e-06, + "loss": 0.3501, + "step": 740 + }, + { + "epoch": 4.005405405405406, + "grad_norm": 3.7292938232421875, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1753, + "step": 741 + }, + { + "epoch": 4.010810810810811, + "grad_norm": 3.627298593521118, + "learning_rate": 3.2685037629384587e-06, + "loss": 0.0722, + "step": 742 + }, + { + "epoch": 4.0162162162162165, + "grad_norm": 3.7558975219726562, + "learning_rate": 3.264462823774085e-06, + "loss": 0.2475, + "step": 743 + }, + { + "epoch": 4.021621621621621, + "grad_norm": 2.991217851638794, + "learning_rate": 3.260419680097268e-06, + "loss": 0.1163, + "step": 744 + }, + { + "epoch": 4.027027027027027, + "grad_norm": 3.315901517868042, + "learning_rate": 3.2563743435673855e-06, + "loss": 0.1325, + "step": 745 + }, + { + "epoch": 4.032432432432432, + "grad_norm": 2.9405429363250732, + "learning_rate": 3.252326825850139e-06, + "loss": 0.0466, + "step": 746 + }, + { + "epoch": 4.037837837837838, + "grad_norm": 4.078726291656494, + "learning_rate": 3.2482771386175173e-06, + "loss": 0.1861, + "step": 747 + }, + { + "epoch": 4.043243243243243, + "grad_norm": 3.6752545833587646, + "learning_rate": 3.24422529354777e-06, + "loss": 0.1637, + "step": 748 + }, + { + "epoch": 4.048648648648649, + "grad_norm": 4.471213340759277, + "learning_rate": 3.2401713023253646e-06, + "loss": 0.1379, + "step": 749 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 4.609938144683838, + "learning_rate": 3.2361151766409628e-06, + "loss": 0.1099, + "step": 750 + }, + { + "epoch": 4.059459459459459, + "grad_norm": 3.7480030059814453, + "learning_rate": 3.232056928191376e-06, + "loss": 0.1422, + "step": 751 + }, + { + "epoch": 4.064864864864865, + "grad_norm": 4.23753547668457, + "learning_rate": 3.2279965686795424e-06, + "loss": 0.2716, + "step": 752 + }, + { + "epoch": 4.07027027027027, + "grad_norm": 4.59039306640625, + "learning_rate": 3.2239341098144833e-06, + "loss": 0.3849, + "step": 753 + }, + { + "epoch": 4.075675675675676, + "grad_norm": 2.9332475662231445, + "learning_rate": 3.219869563311277e-06, + "loss": 0.0768, + "step": 754 + }, + { + "epoch": 4.081081081081081, + "grad_norm": 3.8387272357940674, + "learning_rate": 3.2158029408910213e-06, + "loss": 0.112, + "step": 755 + }, + { + "epoch": 4.0864864864864865, + "grad_norm": 2.5676164627075195, + "learning_rate": 3.2117342542807995e-06, + "loss": 0.1054, + "step": 756 + }, + { + "epoch": 4.091891891891892, + "grad_norm": 3.4695913791656494, + "learning_rate": 3.207663515213648e-06, + "loss": 0.1754, + "step": 757 + }, + { + "epoch": 4.097297297297297, + "grad_norm": 3.531060218811035, + "learning_rate": 3.2035907354285234e-06, + "loss": 0.191, + "step": 758 + }, + { + "epoch": 4.102702702702703, + "grad_norm": 3.8944122791290283, + "learning_rate": 3.1995159266702648e-06, + "loss": 0.1083, + "step": 759 + }, + { + "epoch": 4.108108108108108, + "grad_norm": 3.572751998901367, + "learning_rate": 3.1954391006895635e-06, + "loss": 0.0609, + "step": 760 + }, + { + "epoch": 4.113513513513514, + "grad_norm": 3.533867120742798, + "learning_rate": 3.191360269242928e-06, + "loss": 0.049, + "step": 761 + }, + { + "epoch": 4.118918918918919, + "grad_norm": 3.742013454437256, + "learning_rate": 3.18727944409265e-06, + "loss": 0.1642, + "step": 762 + }, + { + "epoch": 4.124324324324324, + "grad_norm": 3.918525457382202, + "learning_rate": 3.1831966370067714e-06, + "loss": 0.1513, + "step": 763 + }, + { + "epoch": 4.12972972972973, + "grad_norm": 4.906899929046631, + "learning_rate": 3.1791118597590467e-06, + "loss": 0.3276, + "step": 764 + }, + { + "epoch": 4.135135135135135, + "grad_norm": 5.704930305480957, + "learning_rate": 3.1750251241289148e-06, + "loss": 0.4011, + "step": 765 + }, + { + "epoch": 4.140540540540541, + "grad_norm": 4.278724193572998, + "learning_rate": 3.1709364419014615e-06, + "loss": 0.2274, + "step": 766 + }, + { + "epoch": 4.145945945945946, + "grad_norm": 3.7831263542175293, + "learning_rate": 3.166845824867384e-06, + "loss": 0.118, + "step": 767 + }, + { + "epoch": 4.151351351351352, + "grad_norm": 3.6355350017547607, + "learning_rate": 3.162753284822962e-06, + "loss": 0.1109, + "step": 768 + }, + { + "epoch": 4.1567567567567565, + "grad_norm": 4.063662052154541, + "learning_rate": 3.1586588335700176e-06, + "loss": 0.1754, + "step": 769 + }, + { + "epoch": 4.162162162162162, + "grad_norm": 3.404348611831665, + "learning_rate": 3.1545624829158873e-06, + "loss": 0.1155, + "step": 770 + }, + { + "epoch": 4.167567567567567, + "grad_norm": 2.7452480792999268, + "learning_rate": 3.1504642446733828e-06, + "loss": 0.0635, + "step": 771 + }, + { + "epoch": 4.172972972972973, + "grad_norm": 2.4755163192749023, + "learning_rate": 3.146364130660761e-06, + "loss": 0.1068, + "step": 772 + }, + { + "epoch": 4.178378378378379, + "grad_norm": 3.0338311195373535, + "learning_rate": 3.142262152701685e-06, + "loss": 0.0637, + "step": 773 + }, + { + "epoch": 4.183783783783784, + "grad_norm": 4.566886901855469, + "learning_rate": 3.138158322625197e-06, + "loss": 0.2703, + "step": 774 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 4.614205360412598, + "learning_rate": 3.1340526522656765e-06, + "loss": 0.2769, + "step": 775 + }, + { + "epoch": 4.194594594594594, + "grad_norm": 3.4197700023651123, + "learning_rate": 3.1299451534628134e-06, + "loss": 0.1192, + "step": 776 + }, + { + "epoch": 4.2, + "grad_norm": 3.2838752269744873, + "learning_rate": 3.1258358380615674e-06, + "loss": 0.1244, + "step": 777 + }, + { + "epoch": 4.205405405405405, + "grad_norm": 4.484423637390137, + "learning_rate": 3.121724717912138e-06, + "loss": 0.2819, + "step": 778 + }, + { + "epoch": 4.210810810810811, + "grad_norm": 2.6898670196533203, + "learning_rate": 3.1176118048699283e-06, + "loss": 0.1018, + "step": 779 + }, + { + "epoch": 4.216216216216216, + "grad_norm": 3.3304710388183594, + "learning_rate": 3.113497110795514e-06, + "loss": 0.1842, + "step": 780 + }, + { + "epoch": 4.221621621621622, + "grad_norm": 3.29425311088562, + "learning_rate": 3.1093806475546046e-06, + "loss": 0.2299, + "step": 781 + }, + { + "epoch": 4.227027027027027, + "grad_norm": 3.0818686485290527, + "learning_rate": 3.1052624270180116e-06, + "loss": 0.1397, + "step": 782 + }, + { + "epoch": 4.232432432432432, + "grad_norm": 4.569559097290039, + "learning_rate": 3.1011424610616153e-06, + "loss": 0.2236, + "step": 783 + }, + { + "epoch": 4.237837837837838, + "grad_norm": 3.2377943992614746, + "learning_rate": 3.097020761566328e-06, + "loss": 0.1417, + "step": 784 + }, + { + "epoch": 4.243243243243243, + "grad_norm": 5.442404270172119, + "learning_rate": 3.092897340418062e-06, + "loss": 0.1317, + "step": 785 + }, + { + "epoch": 4.248648648648649, + "grad_norm": 4.14007568359375, + "learning_rate": 3.088772209507694e-06, + "loss": 0.1869, + "step": 786 + }, + { + "epoch": 4.254054054054054, + "grad_norm": 3.024740695953369, + "learning_rate": 3.0846453807310317e-06, + "loss": 0.0967, + "step": 787 + }, + { + "epoch": 4.2594594594594595, + "grad_norm": 3.463261365890503, + "learning_rate": 3.080516865988778e-06, + "loss": 0.0731, + "step": 788 + }, + { + "epoch": 4.264864864864865, + "grad_norm": 3.398139715194702, + "learning_rate": 3.076386677186498e-06, + "loss": 0.1912, + "step": 789 + }, + { + "epoch": 4.27027027027027, + "grad_norm": 3.934204339981079, + "learning_rate": 3.0722548262345854e-06, + "loss": 0.2133, + "step": 790 + }, + { + "epoch": 4.275675675675676, + "grad_norm": 5.5322041511535645, + "learning_rate": 3.0681213250482255e-06, + "loss": 0.4454, + "step": 791 + }, + { + "epoch": 4.281081081081081, + "grad_norm": 5.381092071533203, + "learning_rate": 3.0639861855473637e-06, + "loss": 0.3645, + "step": 792 + }, + { + "epoch": 4.286486486486487, + "grad_norm": 4.104682445526123, + "learning_rate": 3.05984941965667e-06, + "loss": 0.1331, + "step": 793 + }, + { + "epoch": 4.291891891891892, + "grad_norm": 3.032749652862549, + "learning_rate": 3.055711039305503e-06, + "loss": 0.0863, + "step": 794 + }, + { + "epoch": 4.297297297297297, + "grad_norm": 3.1181957721710205, + "learning_rate": 3.051571056427879e-06, + "loss": 0.1988, + "step": 795 + }, + { + "epoch": 4.302702702702703, + "grad_norm": 4.8824944496154785, + "learning_rate": 3.047429482962433e-06, + "loss": 0.2307, + "step": 796 + }, + { + "epoch": 4.308108108108108, + "grad_norm": 3.5564794540405273, + "learning_rate": 3.0432863308523903e-06, + "loss": 0.1614, + "step": 797 + }, + { + "epoch": 4.313513513513514, + "grad_norm": 2.928267240524292, + "learning_rate": 3.039141612045525e-06, + "loss": 0.0683, + "step": 798 + }, + { + "epoch": 4.318918918918919, + "grad_norm": 2.846242666244507, + "learning_rate": 3.034995338494131e-06, + "loss": 0.1784, + "step": 799 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 2.8273985385894775, + "learning_rate": 3.0308475221549868e-06, + "loss": 0.0451, + "step": 800 + }, + { + "epoch": 4.3297297297297295, + "grad_norm": 3.0229880809783936, + "learning_rate": 3.026698174989316e-06, + "loss": 0.0618, + "step": 801 + }, + { + "epoch": 4.335135135135135, + "grad_norm": 3.555338144302368, + "learning_rate": 3.0225473089627617e-06, + "loss": 0.1529, + "step": 802 + }, + { + "epoch": 4.34054054054054, + "grad_norm": 3.7206318378448486, + "learning_rate": 3.0183949360453442e-06, + "loss": 0.4177, + "step": 803 + }, + { + "epoch": 4.345945945945946, + "grad_norm": 4.038993835449219, + "learning_rate": 3.014241068211428e-06, + "loss": 0.1394, + "step": 804 + }, + { + "epoch": 4.351351351351352, + "grad_norm": 3.723766565322876, + "learning_rate": 3.0100857174396926e-06, + "loss": 0.04, + "step": 805 + }, + { + "epoch": 4.356756756756757, + "grad_norm": 4.745445728302002, + "learning_rate": 3.0059288957130893e-06, + "loss": 0.2705, + "step": 806 + }, + { + "epoch": 4.3621621621621625, + "grad_norm": 3.245249032974243, + "learning_rate": 3.001770615018815e-06, + "loss": 0.2208, + "step": 807 + }, + { + "epoch": 4.367567567567567, + "grad_norm": 4.631863594055176, + "learning_rate": 2.9976108873482725e-06, + "loss": 0.2068, + "step": 808 + }, + { + "epoch": 4.372972972972973, + "grad_norm": 3.4944963455200195, + "learning_rate": 2.9934497246970357e-06, + "loss": 0.1253, + "step": 809 + }, + { + "epoch": 4.378378378378378, + "grad_norm": 3.393252372741699, + "learning_rate": 2.989287139064819e-06, + "loss": 0.1721, + "step": 810 + }, + { + "epoch": 4.383783783783784, + "grad_norm": 3.2354531288146973, + "learning_rate": 2.9851231424554385e-06, + "loss": 0.134, + "step": 811 + }, + { + "epoch": 4.389189189189189, + "grad_norm": 3.8997225761413574, + "learning_rate": 2.9809577468767813e-06, + "loss": 0.0818, + "step": 812 + }, + { + "epoch": 4.394594594594595, + "grad_norm": 3.4745192527770996, + "learning_rate": 2.9767909643407676e-06, + "loss": 0.1797, + "step": 813 + }, + { + "epoch": 4.4, + "grad_norm": 2.8166556358337402, + "learning_rate": 2.9726228068633155e-06, + "loss": 0.145, + "step": 814 + }, + { + "epoch": 4.405405405405405, + "grad_norm": 3.4947283267974854, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.079, + "step": 815 + }, + { + "epoch": 4.410810810810811, + "grad_norm": 3.8058624267578125, + "learning_rate": 2.9642824151675702e-06, + "loss": 0.1763, + "step": 816 + }, + { + "epoch": 4.416216216216216, + "grad_norm": 3.161440134048462, + "learning_rate": 2.9601102050008016e-06, + "loss": 0.2654, + "step": 817 + }, + { + "epoch": 4.421621621621622, + "grad_norm": 2.7620294094085693, + "learning_rate": 2.955936667995578e-06, + "loss": 0.0779, + "step": 818 + }, + { + "epoch": 4.427027027027027, + "grad_norm": 3.2293593883514404, + "learning_rate": 2.9517618161872974e-06, + "loss": 0.0587, + "step": 819 + }, + { + "epoch": 4.4324324324324325, + "grad_norm": 2.753647565841675, + "learning_rate": 2.9475856616151487e-06, + "loss": 0.0835, + "step": 820 + }, + { + "epoch": 4.437837837837838, + "grad_norm": 3.744755744934082, + "learning_rate": 2.9434082163220773e-06, + "loss": 0.1748, + "step": 821 + }, + { + "epoch": 4.443243243243243, + "grad_norm": 3.5458850860595703, + "learning_rate": 2.9392294923547543e-06, + "loss": 0.119, + "step": 822 + }, + { + "epoch": 4.448648648648649, + "grad_norm": 4.037010192871094, + "learning_rate": 2.9350495017635334e-06, + "loss": 0.1535, + "step": 823 + }, + { + "epoch": 4.454054054054054, + "grad_norm": 3.704439401626587, + "learning_rate": 2.9308682566024228e-06, + "loss": 0.2561, + "step": 824 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 2.9537882804870605, + "learning_rate": 2.92668576892905e-06, + "loss": 0.2024, + "step": 825 + }, + { + "epoch": 4.464864864864865, + "grad_norm": 3.1923575401306152, + "learning_rate": 2.9225020508046233e-06, + "loss": 0.0436, + "step": 826 + }, + { + "epoch": 4.47027027027027, + "grad_norm": 3.304884195327759, + "learning_rate": 2.9183171142939002e-06, + "loss": 0.1636, + "step": 827 + }, + { + "epoch": 4.475675675675676, + "grad_norm": 3.5481832027435303, + "learning_rate": 2.9141309714651528e-06, + "loss": 0.0962, + "step": 828 + }, + { + "epoch": 4.481081081081081, + "grad_norm": 4.0650153160095215, + "learning_rate": 2.9099436343901306e-06, + "loss": 0.2129, + "step": 829 + }, + { + "epoch": 4.486486486486487, + "grad_norm": 4.274670124053955, + "learning_rate": 2.9057551151440266e-06, + "loss": 0.2872, + "step": 830 + }, + { + "epoch": 4.491891891891892, + "grad_norm": 4.45655632019043, + "learning_rate": 2.9015654258054433e-06, + "loss": 0.3254, + "step": 831 + }, + { + "epoch": 4.4972972972972975, + "grad_norm": 3.2205746173858643, + "learning_rate": 2.8973745784563596e-06, + "loss": 0.1417, + "step": 832 + }, + { + "epoch": 4.5027027027027025, + "grad_norm": 3.994489908218384, + "learning_rate": 2.8931825851820904e-06, + "loss": 0.2513, + "step": 833 + }, + { + "epoch": 4.508108108108108, + "grad_norm": 2.8250539302825928, + "learning_rate": 2.8889894580712574e-06, + "loss": 0.1785, + "step": 834 + }, + { + "epoch": 4.513513513513513, + "grad_norm": 3.526552200317383, + "learning_rate": 2.884795209215751e-06, + "loss": 0.2853, + "step": 835 + }, + { + "epoch": 4.518918918918919, + "grad_norm": 3.8975565433502197, + "learning_rate": 2.880599850710696e-06, + "loss": 0.2947, + "step": 836 + }, + { + "epoch": 4.524324324324324, + "grad_norm": 2.86104154586792, + "learning_rate": 2.8764033946544197e-06, + "loss": 0.177, + "step": 837 + }, + { + "epoch": 4.52972972972973, + "grad_norm": 3.967454433441162, + "learning_rate": 2.8722058531484105e-06, + "loss": 0.2786, + "step": 838 + }, + { + "epoch": 4.535135135135135, + "grad_norm": 3.9122490882873535, + "learning_rate": 2.86800723829729e-06, + "loss": 0.1881, + "step": 839 + }, + { + "epoch": 4.54054054054054, + "grad_norm": 3.9732089042663574, + "learning_rate": 2.8638075622087747e-06, + "loss": 0.3541, + "step": 840 + }, + { + "epoch": 4.545945945945946, + "grad_norm": 3.7056405544281006, + "learning_rate": 2.8596068369936386e-06, + "loss": 0.3094, + "step": 841 + }, + { + "epoch": 4.551351351351351, + "grad_norm": 3.5056777000427246, + "learning_rate": 2.8554050747656862e-06, + "loss": 0.1162, + "step": 842 + }, + { + "epoch": 4.556756756756757, + "grad_norm": 3.1131439208984375, + "learning_rate": 2.851202287641709e-06, + "loss": 0.1079, + "step": 843 + }, + { + "epoch": 4.562162162162162, + "grad_norm": 3.6517693996429443, + "learning_rate": 2.8469984877414525e-06, + "loss": 0.4462, + "step": 844 + }, + { + "epoch": 4.5675675675675675, + "grad_norm": 3.0627806186676025, + "learning_rate": 2.842793687187588e-06, + "loss": 0.0851, + "step": 845 + }, + { + "epoch": 4.572972972972973, + "grad_norm": 4.0370893478393555, + "learning_rate": 2.8385878981056663e-06, + "loss": 0.1268, + "step": 846 + }, + { + "epoch": 4.578378378378378, + "grad_norm": 3.486156463623047, + "learning_rate": 2.8343811326240944e-06, + "loss": 0.3187, + "step": 847 + }, + { + "epoch": 4.583783783783784, + "grad_norm": 2.4388604164123535, + "learning_rate": 2.830173402874091e-06, + "loss": 0.1315, + "step": 848 + }, + { + "epoch": 4.589189189189189, + "grad_norm": 3.5970475673675537, + "learning_rate": 2.8259647209896573e-06, + "loss": 0.301, + "step": 849 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 3.657775402069092, + "learning_rate": 2.821755099107541e-06, + "loss": 0.1478, + "step": 850 + }, + { + "epoch": 4.6, + "grad_norm": 3.2040653228759766, + "learning_rate": 2.817544549367197e-06, + "loss": 0.2029, + "step": 851 + }, + { + "epoch": 4.605405405405405, + "grad_norm": 2.778747081756592, + "learning_rate": 2.813333083910761e-06, + "loss": 0.0549, + "step": 852 + }, + { + "epoch": 4.610810810810811, + "grad_norm": 3.661921977996826, + "learning_rate": 2.8091207148830046e-06, + "loss": 0.1508, + "step": 853 + }, + { + "epoch": 4.616216216216216, + "grad_norm": 2.7028398513793945, + "learning_rate": 2.8049074544313094e-06, + "loss": 0.1094, + "step": 854 + }, + { + "epoch": 4.621621621621622, + "grad_norm": 3.3319056034088135, + "learning_rate": 2.8006933147056236e-06, + "loss": 0.0799, + "step": 855 + }, + { + "epoch": 4.627027027027027, + "grad_norm": 3.3194944858551025, + "learning_rate": 2.7964783078584336e-06, + "loss": 0.123, + "step": 856 + }, + { + "epoch": 4.632432432432433, + "grad_norm": 2.4618616104125977, + "learning_rate": 2.792262446044725e-06, + "loss": 0.0692, + "step": 857 + }, + { + "epoch": 4.6378378378378375, + "grad_norm": 4.007084846496582, + "learning_rate": 2.788045741421949e-06, + "loss": 0.1596, + "step": 858 + }, + { + "epoch": 4.643243243243243, + "grad_norm": 2.6852214336395264, + "learning_rate": 2.78382820614999e-06, + "loss": 0.047, + "step": 859 + }, + { + "epoch": 4.648648648648649, + "grad_norm": 3.249666690826416, + "learning_rate": 2.779609852391123e-06, + "loss": 0.1561, + "step": 860 + }, + { + "epoch": 4.654054054054054, + "grad_norm": 7.2313337326049805, + "learning_rate": 2.775390692309987e-06, + "loss": 0.2157, + "step": 861 + }, + { + "epoch": 4.65945945945946, + "grad_norm": 3.1866044998168945, + "learning_rate": 2.7711707380735443e-06, + "loss": 0.0782, + "step": 862 + }, + { + "epoch": 4.664864864864865, + "grad_norm": 3.714812755584717, + "learning_rate": 2.766950001851049e-06, + "loss": 0.2994, + "step": 863 + }, + { + "epoch": 4.6702702702702705, + "grad_norm": 3.0355515480041504, + "learning_rate": 2.7627284958140084e-06, + "loss": 0.109, + "step": 864 + }, + { + "epoch": 4.675675675675675, + "grad_norm": 2.8177638053894043, + "learning_rate": 2.7585062321361517e-06, + "loss": 0.2557, + "step": 865 + }, + { + "epoch": 4.681081081081081, + "grad_norm": 3.7162227630615234, + "learning_rate": 2.75428322299339e-06, + "loss": 0.0413, + "step": 866 + }, + { + "epoch": 4.686486486486486, + "grad_norm": 3.008643627166748, + "learning_rate": 2.7500594805637882e-06, + "loss": 0.0402, + "step": 867 + }, + { + "epoch": 4.691891891891892, + "grad_norm": 3.1683881282806396, + "learning_rate": 2.745835017027522e-06, + "loss": 0.1481, + "step": 868 + }, + { + "epoch": 4.697297297297297, + "grad_norm": 3.2899327278137207, + "learning_rate": 2.74160984456685e-06, + "loss": 0.2242, + "step": 869 + }, + { + "epoch": 4.702702702702703, + "grad_norm": 5.386324882507324, + "learning_rate": 2.737383975366071e-06, + "loss": 0.4693, + "step": 870 + }, + { + "epoch": 4.708108108108108, + "grad_norm": 3.0007741451263428, + "learning_rate": 2.7331574216114963e-06, + "loss": 0.1353, + "step": 871 + }, + { + "epoch": 4.713513513513513, + "grad_norm": 2.7533962726593018, + "learning_rate": 2.728930195491411e-06, + "loss": 0.157, + "step": 872 + }, + { + "epoch": 4.718918918918919, + "grad_norm": 3.349351167678833, + "learning_rate": 2.724702309196038e-06, + "loss": 0.1863, + "step": 873 + }, + { + "epoch": 4.724324324324324, + "grad_norm": 3.2562623023986816, + "learning_rate": 2.720473774917505e-06, + "loss": 0.2874, + "step": 874 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 3.4865262508392334, + "learning_rate": 2.716244604849807e-06, + "loss": 0.1021, + "step": 875 + }, + { + "epoch": 4.735135135135135, + "grad_norm": 3.793647289276123, + "learning_rate": 2.7120148111887732e-06, + "loss": 0.1046, + "step": 876 + }, + { + "epoch": 4.7405405405405405, + "grad_norm": 3.8841137886047363, + "learning_rate": 2.707784406132032e-06, + "loss": 0.0971, + "step": 877 + }, + { + "epoch": 4.745945945945946, + "grad_norm": 3.45615816116333, + "learning_rate": 2.703553401878972e-06, + "loss": 0.0507, + "step": 878 + }, + { + "epoch": 4.751351351351351, + "grad_norm": 3.578495502471924, + "learning_rate": 2.6993218106307146e-06, + "loss": 0.0616, + "step": 879 + }, + { + "epoch": 4.756756756756757, + "grad_norm": 4.271491527557373, + "learning_rate": 2.6950896445900685e-06, + "loss": 0.0908, + "step": 880 + }, + { + "epoch": 4.762162162162162, + "grad_norm": 3.889042615890503, + "learning_rate": 2.690856915961504e-06, + "loss": 0.2426, + "step": 881 + }, + { + "epoch": 4.767567567567568, + "grad_norm": 3.8519232273101807, + "learning_rate": 2.686623636951112e-06, + "loss": 0.1881, + "step": 882 + }, + { + "epoch": 4.772972972972973, + "grad_norm": 3.819518804550171, + "learning_rate": 2.6823898197665703e-06, + "loss": 0.1385, + "step": 883 + }, + { + "epoch": 4.778378378378378, + "grad_norm": 4.091328144073486, + "learning_rate": 2.6781554766171104e-06, + "loss": 0.2913, + "step": 884 + }, + { + "epoch": 4.783783783783784, + "grad_norm": 2.60793399810791, + "learning_rate": 2.673920619713478e-06, + "loss": 0.0874, + "step": 885 + }, + { + "epoch": 4.789189189189189, + "grad_norm": 4.59322452545166, + "learning_rate": 2.6696852612679024e-06, + "loss": 0.2703, + "step": 886 + }, + { + "epoch": 4.794594594594595, + "grad_norm": 3.4631619453430176, + "learning_rate": 2.6654494134940586e-06, + "loss": 0.121, + "step": 887 + }, + { + "epoch": 4.8, + "grad_norm": 3.8556058406829834, + "learning_rate": 2.6612130886070313e-06, + "loss": 0.1853, + "step": 888 + }, + { + "epoch": 4.805405405405406, + "grad_norm": 2.932152271270752, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.0533, + "step": 889 + }, + { + "epoch": 4.8108108108108105, + "grad_norm": 4.647441387176514, + "learning_rate": 2.652739056360618e-06, + "loss": 0.3178, + "step": 890 + }, + { + "epoch": 4.816216216216216, + "grad_norm": 4.682106018066406, + "learning_rate": 2.648501373438142e-06, + "loss": 0.1735, + "step": 891 + }, + { + "epoch": 4.821621621621622, + "grad_norm": 3.1454825401306152, + "learning_rate": 2.644263262276234e-06, + "loss": 0.062, + "step": 892 + }, + { + "epoch": 4.827027027027027, + "grad_norm": 3.579653739929199, + "learning_rate": 2.640024735096507e-06, + "loss": 0.1336, + "step": 893 + }, + { + "epoch": 4.832432432432433, + "grad_norm": 2.558265447616577, + "learning_rate": 2.6357858041217733e-06, + "loss": 0.1404, + "step": 894 + }, + { + "epoch": 4.837837837837838, + "grad_norm": 2.3879470825195312, + "learning_rate": 2.6315464815760104e-06, + "loss": 0.0373, + "step": 895 + }, + { + "epoch": 4.8432432432432435, + "grad_norm": 4.418992042541504, + "learning_rate": 2.6273067796843242e-06, + "loss": 0.3068, + "step": 896 + }, + { + "epoch": 4.848648648648648, + "grad_norm": 3.08585786819458, + "learning_rate": 2.6230667106729157e-06, + "loss": 0.2221, + "step": 897 + }, + { + "epoch": 4.854054054054054, + "grad_norm": 2.9488885402679443, + "learning_rate": 2.618826286769043e-06, + "loss": 0.1431, + "step": 898 + }, + { + "epoch": 4.859459459459459, + "grad_norm": 4.123927116394043, + "learning_rate": 2.614585520200989e-06, + "loss": 0.196, + "step": 899 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 4.289125919342041, + "learning_rate": 2.6103444231980233e-06, + "loss": 0.2509, + "step": 900 + }, + { + "epoch": 4.87027027027027, + "grad_norm": 3.0358095169067383, + "learning_rate": 2.606103007990371e-06, + "loss": 0.0747, + "step": 901 + }, + { + "epoch": 4.875675675675676, + "grad_norm": 3.6471376419067383, + "learning_rate": 2.601861286809172e-06, + "loss": 0.0494, + "step": 902 + }, + { + "epoch": 4.881081081081081, + "grad_norm": 3.424712896347046, + "learning_rate": 2.5976192718864497e-06, + "loss": 0.0901, + "step": 903 + }, + { + "epoch": 4.886486486486486, + "grad_norm": 4.047586441040039, + "learning_rate": 2.593376975455075e-06, + "loss": 0.0465, + "step": 904 + }, + { + "epoch": 4.891891891891892, + "grad_norm": 4.448032379150391, + "learning_rate": 2.5891344097487294e-06, + "loss": 0.0616, + "step": 905 + }, + { + "epoch": 4.897297297297297, + "grad_norm": 3.3522684574127197, + "learning_rate": 2.584891587001872e-06, + "loss": 0.087, + "step": 906 + }, + { + "epoch": 4.902702702702703, + "grad_norm": 2.979238986968994, + "learning_rate": 2.580648519449704e-06, + "loss": 0.053, + "step": 907 + }, + { + "epoch": 4.908108108108108, + "grad_norm": 6.049450397491455, + "learning_rate": 2.5764052193281287e-06, + "loss": 0.2707, + "step": 908 + }, + { + "epoch": 4.9135135135135135, + "grad_norm": 6.647163391113281, + "learning_rate": 2.5721616988737254e-06, + "loss": 0.3679, + "step": 909 + }, + { + "epoch": 4.918918918918919, + "grad_norm": 3.764979839324951, + "learning_rate": 2.567917970323704e-06, + "loss": 0.1929, + "step": 910 + }, + { + "epoch": 4.924324324324324, + "grad_norm": 3.5592362880706787, + "learning_rate": 2.5636740459158776e-06, + "loss": 0.2461, + "step": 911 + }, + { + "epoch": 4.92972972972973, + "grad_norm": 4.4554762840271, + "learning_rate": 2.559429937888624e-06, + "loss": 0.2484, + "step": 912 + }, + { + "epoch": 4.935135135135135, + "grad_norm": 3.358375072479248, + "learning_rate": 2.5551856584808483e-06, + "loss": 0.1886, + "step": 913 + }, + { + "epoch": 4.940540540540541, + "grad_norm": 3.5831756591796875, + "learning_rate": 2.5509412199319515e-06, + "loss": 0.1789, + "step": 914 + }, + { + "epoch": 4.945945945945946, + "grad_norm": 2.4555728435516357, + "learning_rate": 2.5466966344817927e-06, + "loss": 0.1072, + "step": 915 + }, + { + "epoch": 4.951351351351351, + "grad_norm": 4.581109046936035, + "learning_rate": 2.542451914370656e-06, + "loss": 0.2624, + "step": 916 + }, + { + "epoch": 4.956756756756757, + "grad_norm": 2.9763975143432617, + "learning_rate": 2.538207071839213e-06, + "loss": 0.0639, + "step": 917 + }, + { + "epoch": 4.962162162162162, + "grad_norm": 3.516282796859741, + "learning_rate": 2.533962119128487e-06, + "loss": 0.1281, + "step": 918 + }, + { + "epoch": 4.967567567567568, + "grad_norm": 3.0369791984558105, + "learning_rate": 2.529717068479821e-06, + "loss": 0.1771, + "step": 919 + }, + { + "epoch": 4.972972972972973, + "grad_norm": 2.998521327972412, + "learning_rate": 2.5254719321348392e-06, + "loss": 0.2582, + "step": 920 + }, + { + "epoch": 4.978378378378379, + "grad_norm": 3.002901792526245, + "learning_rate": 2.5212267223354143e-06, + "loss": 0.3016, + "step": 921 + }, + { + "epoch": 4.9837837837837835, + "grad_norm": 3.564932346343994, + "learning_rate": 2.5169814513236296e-06, + "loss": 0.2775, + "step": 922 + }, + { + "epoch": 4.989189189189189, + "grad_norm": 3.726227283477783, + "learning_rate": 2.5127361313417447e-06, + "loss": 0.1246, + "step": 923 + }, + { + "epoch": 4.994594594594595, + "grad_norm": 4.766391754150391, + "learning_rate": 2.508490774632162e-06, + "loss": 0.1732, + "step": 924 + }, + { + "epoch": 5.0, + "grad_norm": 2.9859752655029297, + "learning_rate": 2.5042453934373874e-06, + "loss": 0.1107, + "step": 925 + }, + { + "epoch": 5.005405405405406, + "grad_norm": 3.4388909339904785, + "learning_rate": 2.5e-06, + "loss": 0.1074, + "step": 926 + }, + { + "epoch": 5.010810810810811, + "grad_norm": 2.959311008453369, + "learning_rate": 2.4957546065626134e-06, + "loss": 0.0752, + "step": 927 + }, + { + "epoch": 5.0162162162162165, + "grad_norm": 2.047055959701538, + "learning_rate": 2.491509225367839e-06, + "loss": 0.0313, + "step": 928 + }, + { + "epoch": 5.021621621621621, + "grad_norm": 2.310882329940796, + "learning_rate": 2.487263868658256e-06, + "loss": 0.0851, + "step": 929 + }, + { + "epoch": 5.027027027027027, + "grad_norm": 2.3032779693603516, + "learning_rate": 2.483018548676371e-06, + "loss": 0.0443, + "step": 930 + }, + { + "epoch": 5.032432432432432, + "grad_norm": 3.521470785140991, + "learning_rate": 2.478773277664587e-06, + "loss": 0.056, + "step": 931 + }, + { + "epoch": 5.037837837837838, + "grad_norm": 3.8374359607696533, + "learning_rate": 2.4745280678651616e-06, + "loss": 0.1668, + "step": 932 + }, + { + "epoch": 5.043243243243243, + "grad_norm": 3.831840753555298, + "learning_rate": 2.47028293152018e-06, + "loss": 0.0502, + "step": 933 + }, + { + "epoch": 5.048648648648649, + "grad_norm": 3.398419141769409, + "learning_rate": 2.4660378808715147e-06, + "loss": 0.023, + "step": 934 + }, + { + "epoch": 5.054054054054054, + "grad_norm": 3.3384788036346436, + "learning_rate": 2.4617929281607885e-06, + "loss": 0.1418, + "step": 935 + }, + { + "epoch": 5.059459459459459, + "grad_norm": 5.451812744140625, + "learning_rate": 2.457548085629345e-06, + "loss": 0.1167, + "step": 936 + }, + { + "epoch": 5.064864864864865, + "grad_norm": 6.509985446929932, + "learning_rate": 2.4533033655182072e-06, + "loss": 0.0781, + "step": 937 + }, + { + "epoch": 5.07027027027027, + "grad_norm": 4.330167770385742, + "learning_rate": 2.449058780068049e-06, + "loss": 0.0799, + "step": 938 + }, + { + "epoch": 5.075675675675676, + "grad_norm": 3.6900534629821777, + "learning_rate": 2.444814341519152e-06, + "loss": 0.0548, + "step": 939 + }, + { + "epoch": 5.081081081081081, + "grad_norm": 3.347656011581421, + "learning_rate": 2.440570062111376e-06, + "loss": 0.1218, + "step": 940 + }, + { + "epoch": 5.0864864864864865, + "grad_norm": 2.6146252155303955, + "learning_rate": 2.436325954084122e-06, + "loss": 0.0182, + "step": 941 + }, + { + "epoch": 5.091891891891892, + "grad_norm": 2.852694034576416, + "learning_rate": 2.4320820296762964e-06, + "loss": 0.0337, + "step": 942 + }, + { + "epoch": 5.097297297297297, + "grad_norm": 1.9230271577835083, + "learning_rate": 2.4278383011262755e-06, + "loss": 0.0226, + "step": 943 + }, + { + "epoch": 5.102702702702703, + "grad_norm": 2.6784677505493164, + "learning_rate": 2.4235947806718717e-06, + "loss": 0.0207, + "step": 944 + }, + { + "epoch": 5.108108108108108, + "grad_norm": 3.4410207271575928, + "learning_rate": 2.4193514805502972e-06, + "loss": 0.1561, + "step": 945 + }, + { + "epoch": 5.113513513513514, + "grad_norm": 3.165294647216797, + "learning_rate": 2.4151084129981284e-06, + "loss": 0.1727, + "step": 946 + }, + { + "epoch": 5.118918918918919, + "grad_norm": 2.743256092071533, + "learning_rate": 2.4108655902512715e-06, + "loss": 0.1246, + "step": 947 + }, + { + "epoch": 5.124324324324324, + "grad_norm": 3.771273374557495, + "learning_rate": 2.406623024544926e-06, + "loss": 0.1429, + "step": 948 + }, + { + "epoch": 5.12972972972973, + "grad_norm": 3.4866952896118164, + "learning_rate": 2.402380728113551e-06, + "loss": 0.1569, + "step": 949 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 3.5998377799987793, + "learning_rate": 2.3981387131908286e-06, + "loss": 0.1105, + "step": 950 + }, + { + "epoch": 5.140540540540541, + "grad_norm": 6.748101234436035, + "learning_rate": 2.39389699200963e-06, + "loss": 0.3786, + "step": 951 + }, + { + "epoch": 5.145945945945946, + "grad_norm": 4.391526699066162, + "learning_rate": 2.389655576801977e-06, + "loss": 0.0826, + "step": 952 + }, + { + "epoch": 5.151351351351352, + "grad_norm": 4.411531448364258, + "learning_rate": 2.3854144797990123e-06, + "loss": 0.0684, + "step": 953 + }, + { + "epoch": 5.1567567567567565, + "grad_norm": 3.2221450805664062, + "learning_rate": 2.3811737132309584e-06, + "loss": 0.0452, + "step": 954 + }, + { + "epoch": 5.162162162162162, + "grad_norm": 2.926665782928467, + "learning_rate": 2.3769332893270856e-06, + "loss": 0.0465, + "step": 955 + }, + { + "epoch": 5.167567567567567, + "grad_norm": 2.909715414047241, + "learning_rate": 2.372693220315677e-06, + "loss": 0.0551, + "step": 956 + }, + { + "epoch": 5.172972972972973, + "grad_norm": 3.3920676708221436, + "learning_rate": 2.36845351842399e-06, + "loss": 0.0896, + "step": 957 + }, + { + "epoch": 5.178378378378379, + "grad_norm": 2.4355857372283936, + "learning_rate": 2.3642141958782267e-06, + "loss": 0.0565, + "step": 958 + }, + { + "epoch": 5.183783783783784, + "grad_norm": 4.707484722137451, + "learning_rate": 2.3599752649034935e-06, + "loss": 0.1563, + "step": 959 + }, + { + "epoch": 5.1891891891891895, + "grad_norm": 2.0196712017059326, + "learning_rate": 2.3557367377237663e-06, + "loss": 0.0236, + "step": 960 + }, + { + "epoch": 5.194594594594594, + "grad_norm": 2.5355868339538574, + "learning_rate": 2.351498626561858e-06, + "loss": 0.0506, + "step": 961 + }, + { + "epoch": 5.2, + "grad_norm": 3.384859800338745, + "learning_rate": 2.3472609436393827e-06, + "loss": 0.1001, + "step": 962 + }, + { + "epoch": 5.205405405405405, + "grad_norm": 3.557605028152466, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.0951, + "step": 963 + }, + { + "epoch": 5.210810810810811, + "grad_norm": 2.9991750717163086, + "learning_rate": 2.3387869113929695e-06, + "loss": 0.0824, + "step": 964 + }, + { + "epoch": 5.216216216216216, + "grad_norm": 3.3849830627441406, + "learning_rate": 2.3345505865059427e-06, + "loss": 0.0485, + "step": 965 + }, + { + "epoch": 5.221621621621622, + "grad_norm": 3.781913995742798, + "learning_rate": 2.3303147387320985e-06, + "loss": 0.1516, + "step": 966 + }, + { + "epoch": 5.227027027027027, + "grad_norm": 3.5771679878234863, + "learning_rate": 2.3260793802865227e-06, + "loss": 0.1664, + "step": 967 + }, + { + "epoch": 5.232432432432432, + "grad_norm": 3.4213743209838867, + "learning_rate": 2.3218445233828904e-06, + "loss": 0.1127, + "step": 968 + }, + { + "epoch": 5.237837837837838, + "grad_norm": 3.315171003341675, + "learning_rate": 2.31761018023343e-06, + "loss": 0.0445, + "step": 969 + }, + { + "epoch": 5.243243243243243, + "grad_norm": 4.793919563293457, + "learning_rate": 2.3133763630488883e-06, + "loss": 0.1402, + "step": 970 + }, + { + "epoch": 5.248648648648649, + "grad_norm": 2.4062092304229736, + "learning_rate": 2.3091430840384964e-06, + "loss": 0.0332, + "step": 971 + }, + { + "epoch": 5.254054054054054, + "grad_norm": 3.5533835887908936, + "learning_rate": 2.304910355409932e-06, + "loss": 0.1266, + "step": 972 + }, + { + "epoch": 5.2594594594594595, + "grad_norm": 3.447761058807373, + "learning_rate": 2.3006781893692863e-06, + "loss": 0.0281, + "step": 973 + }, + { + "epoch": 5.264864864864865, + "grad_norm": 2.2596893310546875, + "learning_rate": 2.2964465981210283e-06, + "loss": 0.0238, + "step": 974 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 2.9317407608032227, + "learning_rate": 2.2922155938679695e-06, + "loss": 0.0828, + "step": 975 + }, + { + "epoch": 5.275675675675676, + "grad_norm": 4.982219219207764, + "learning_rate": 2.287985188811228e-06, + "loss": 0.1874, + "step": 976 + }, + { + "epoch": 5.281081081081081, + "grad_norm": 2.643747091293335, + "learning_rate": 2.2837553951501935e-06, + "loss": 0.0413, + "step": 977 + }, + { + "epoch": 5.286486486486487, + "grad_norm": 3.7542672157287598, + "learning_rate": 2.279526225082495e-06, + "loss": 0.0909, + "step": 978 + }, + { + "epoch": 5.291891891891892, + "grad_norm": 4.562160015106201, + "learning_rate": 2.275297690803962e-06, + "loss": 0.0798, + "step": 979 + }, + { + "epoch": 5.297297297297297, + "grad_norm": 3.627634048461914, + "learning_rate": 2.271069804508589e-06, + "loss": 0.1456, + "step": 980 + }, + { + "epoch": 5.302702702702703, + "grad_norm": 3.0197503566741943, + "learning_rate": 2.266842578388504e-06, + "loss": 0.085, + "step": 981 + }, + { + "epoch": 5.308108108108108, + "grad_norm": 3.1097187995910645, + "learning_rate": 2.2626160246339303e-06, + "loss": 0.0885, + "step": 982 + }, + { + "epoch": 5.313513513513514, + "grad_norm": 3.504622459411621, + "learning_rate": 2.2583901554331513e-06, + "loss": 0.1543, + "step": 983 + }, + { + "epoch": 5.318918918918919, + "grad_norm": 3.6203200817108154, + "learning_rate": 2.2541649829724783e-06, + "loss": 0.06, + "step": 984 + }, + { + "epoch": 5.324324324324325, + "grad_norm": 3.441621780395508, + "learning_rate": 2.249940519436212e-06, + "loss": 0.0518, + "step": 985 + }, + { + "epoch": 5.3297297297297295, + "grad_norm": 3.5617616176605225, + "learning_rate": 2.2457167770066104e-06, + "loss": 0.1542, + "step": 986 + }, + { + "epoch": 5.335135135135135, + "grad_norm": 2.4165892601013184, + "learning_rate": 2.2414937678638495e-06, + "loss": 0.0338, + "step": 987 + }, + { + "epoch": 5.34054054054054, + "grad_norm": 2.450880289077759, + "learning_rate": 2.2372715041859925e-06, + "loss": 0.0204, + "step": 988 + }, + { + "epoch": 5.345945945945946, + "grad_norm": 3.0658836364746094, + "learning_rate": 2.2330499981489524e-06, + "loss": 0.129, + "step": 989 + }, + { + "epoch": 5.351351351351352, + "grad_norm": 2.368131160736084, + "learning_rate": 2.2288292619264566e-06, + "loss": 0.0307, + "step": 990 + }, + { + "epoch": 5.356756756756757, + "grad_norm": 2.3199515342712402, + "learning_rate": 2.2246093076900145e-06, + "loss": 0.0374, + "step": 991 + }, + { + "epoch": 5.3621621621621625, + "grad_norm": 2.5552587509155273, + "learning_rate": 2.220390147608878e-06, + "loss": 0.0265, + "step": 992 + }, + { + "epoch": 5.367567567567567, + "grad_norm": 3.5336551666259766, + "learning_rate": 2.2161717938500112e-06, + "loss": 0.0468, + "step": 993 + }, + { + "epoch": 5.372972972972973, + "grad_norm": 2.8977596759796143, + "learning_rate": 2.2119542585780513e-06, + "loss": 0.1118, + "step": 994 + }, + { + "epoch": 5.378378378378378, + "grad_norm": 4.2495951652526855, + "learning_rate": 2.2077375539552764e-06, + "loss": 0.2056, + "step": 995 + }, + { + "epoch": 5.383783783783784, + "grad_norm": 3.5974740982055664, + "learning_rate": 2.203521692141568e-06, + "loss": 0.0437, + "step": 996 + }, + { + "epoch": 5.389189189189189, + "grad_norm": 4.290375232696533, + "learning_rate": 2.199306685294377e-06, + "loss": 0.1981, + "step": 997 + }, + { + "epoch": 5.394594594594595, + "grad_norm": 3.3619349002838135, + "learning_rate": 2.1950925455686906e-06, + "loss": 0.0756, + "step": 998 + }, + { + "epoch": 5.4, + "grad_norm": 2.673149585723877, + "learning_rate": 2.1908792851169954e-06, + "loss": 0.0998, + "step": 999 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 2.308863401412964, + "learning_rate": 2.186666916089239e-06, + "loss": 0.0223, + "step": 1000 + }, + { + "epoch": 5.410810810810811, + "grad_norm": 2.606580972671509, + "learning_rate": 2.1824554506328033e-06, + "loss": 0.0489, + "step": 1001 + }, + { + "epoch": 5.416216216216216, + "grad_norm": 1.9544821977615356, + "learning_rate": 2.17824490089246e-06, + "loss": 0.0321, + "step": 1002 + }, + { + "epoch": 5.421621621621622, + "grad_norm": 2.374169111251831, + "learning_rate": 2.174035279010343e-06, + "loss": 0.0167, + "step": 1003 + }, + { + "epoch": 5.427027027027027, + "grad_norm": 2.8189785480499268, + "learning_rate": 2.1698265971259104e-06, + "loss": 0.0588, + "step": 1004 + }, + { + "epoch": 5.4324324324324325, + "grad_norm": 3.0042636394500732, + "learning_rate": 2.1656188673759065e-06, + "loss": 0.0868, + "step": 1005 + }, + { + "epoch": 5.437837837837838, + "grad_norm": 3.351011276245117, + "learning_rate": 2.1614121018943346e-06, + "loss": 0.1131, + "step": 1006 + }, + { + "epoch": 5.443243243243243, + "grad_norm": 1.8294633626937866, + "learning_rate": 2.1572063128124133e-06, + "loss": 0.0285, + "step": 1007 + }, + { + "epoch": 5.448648648648649, + "grad_norm": 2.9738781452178955, + "learning_rate": 2.153001512258548e-06, + "loss": 0.0303, + "step": 1008 + }, + { + "epoch": 5.454054054054054, + "grad_norm": 3.807075023651123, + "learning_rate": 2.1487977123582922e-06, + "loss": 0.3278, + "step": 1009 + }, + { + "epoch": 5.45945945945946, + "grad_norm": 2.4742624759674072, + "learning_rate": 2.144594925234314e-06, + "loss": 0.0346, + "step": 1010 + }, + { + "epoch": 5.464864864864865, + "grad_norm": 2.3810906410217285, + "learning_rate": 2.140393163006362e-06, + "loss": 0.0874, + "step": 1011 + }, + { + "epoch": 5.47027027027027, + "grad_norm": 2.964308738708496, + "learning_rate": 2.1361924377912266e-06, + "loss": 0.0194, + "step": 1012 + }, + { + "epoch": 5.475675675675676, + "grad_norm": 4.374764919281006, + "learning_rate": 2.1319927617027112e-06, + "loss": 0.1193, + "step": 1013 + }, + { + "epoch": 5.481081081081081, + "grad_norm": 2.9093267917633057, + "learning_rate": 2.1277941468515908e-06, + "loss": 0.0331, + "step": 1014 + }, + { + "epoch": 5.486486486486487, + "grad_norm": 3.3543128967285156, + "learning_rate": 2.123596605345582e-06, + "loss": 0.0723, + "step": 1015 + }, + { + "epoch": 5.491891891891892, + "grad_norm": 3.7927865982055664, + "learning_rate": 2.119400149289305e-06, + "loss": 0.0751, + "step": 1016 + }, + { + "epoch": 5.4972972972972975, + "grad_norm": 2.6409950256347656, + "learning_rate": 2.11520479078425e-06, + "loss": 0.0265, + "step": 1017 + }, + { + "epoch": 5.5027027027027025, + "grad_norm": 3.3015005588531494, + "learning_rate": 2.111010541928743e-06, + "loss": 0.1023, + "step": 1018 + }, + { + "epoch": 5.508108108108108, + "grad_norm": 3.591866970062256, + "learning_rate": 2.10681741481791e-06, + "loss": 0.0831, + "step": 1019 + }, + { + "epoch": 5.513513513513513, + "grad_norm": 3.2032251358032227, + "learning_rate": 2.1026254215436408e-06, + "loss": 0.1258, + "step": 1020 + }, + { + "epoch": 5.518918918918919, + "grad_norm": 2.9865293502807617, + "learning_rate": 2.098434574194557e-06, + "loss": 0.0926, + "step": 1021 + }, + { + "epoch": 5.524324324324324, + "grad_norm": 2.4018800258636475, + "learning_rate": 2.094244884855974e-06, + "loss": 0.0306, + "step": 1022 + }, + { + "epoch": 5.52972972972973, + "grad_norm": 3.0807738304138184, + "learning_rate": 2.0900563656098706e-06, + "loss": 0.1374, + "step": 1023 + }, + { + "epoch": 5.535135135135135, + "grad_norm": 3.1328487396240234, + "learning_rate": 2.085869028534848e-06, + "loss": 0.1173, + "step": 1024 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 3.2709290981292725, + "learning_rate": 2.0816828857061e-06, + "loss": 0.146, + "step": 1025 + }, + { + "epoch": 5.545945945945946, + "grad_norm": 4.698089122772217, + "learning_rate": 2.077497949195378e-06, + "loss": 0.1542, + "step": 1026 + }, + { + "epoch": 5.551351351351351, + "grad_norm": 2.902589797973633, + "learning_rate": 2.073314231070951e-06, + "loss": 0.0699, + "step": 1027 + }, + { + "epoch": 5.556756756756757, + "grad_norm": 4.043124198913574, + "learning_rate": 2.069131743397578e-06, + "loss": 0.1429, + "step": 1028 + }, + { + "epoch": 5.562162162162162, + "grad_norm": 3.168281316757202, + "learning_rate": 2.0649504982364674e-06, + "loss": 0.1203, + "step": 1029 + }, + { + "epoch": 5.5675675675675675, + "grad_norm": 2.7638514041900635, + "learning_rate": 2.0607705076452465e-06, + "loss": 0.1078, + "step": 1030 + }, + { + "epoch": 5.572972972972973, + "grad_norm": 3.3716790676116943, + "learning_rate": 2.056591783677923e-06, + "loss": 0.0881, + "step": 1031 + }, + { + "epoch": 5.578378378378378, + "grad_norm": 3.6879029273986816, + "learning_rate": 2.0524143383848525e-06, + "loss": 0.0586, + "step": 1032 + }, + { + "epoch": 5.583783783783784, + "grad_norm": 5.253712177276611, + "learning_rate": 2.048238183812704e-06, + "loss": 0.3671, + "step": 1033 + }, + { + "epoch": 5.589189189189189, + "grad_norm": 3.237152099609375, + "learning_rate": 2.0440633320044224e-06, + "loss": 0.048, + "step": 1034 + }, + { + "epoch": 5.594594594594595, + "grad_norm": 3.8771812915802, + "learning_rate": 2.0398897949991992e-06, + "loss": 0.2091, + "step": 1035 + }, + { + "epoch": 5.6, + "grad_norm": 4.612788200378418, + "learning_rate": 2.0357175848324306e-06, + "loss": 0.1295, + "step": 1036 + }, + { + "epoch": 5.605405405405405, + "grad_norm": 3.0990102291107178, + "learning_rate": 2.031546713535688e-06, + "loss": 0.0504, + "step": 1037 + }, + { + "epoch": 5.610810810810811, + "grad_norm": 4.607776641845703, + "learning_rate": 2.027377193136684e-06, + "loss": 0.1816, + "step": 1038 + }, + { + "epoch": 5.616216216216216, + "grad_norm": 2.6812732219696045, + "learning_rate": 2.0232090356592333e-06, + "loss": 0.0392, + "step": 1039 + }, + { + "epoch": 5.621621621621622, + "grad_norm": 2.9481258392333984, + "learning_rate": 2.0190422531232186e-06, + "loss": 0.0273, + "step": 1040 + }, + { + "epoch": 5.627027027027027, + "grad_norm": 2.7125625610351562, + "learning_rate": 2.014876857544562e-06, + "loss": 0.0672, + "step": 1041 + }, + { + "epoch": 5.632432432432433, + "grad_norm": 3.4124906063079834, + "learning_rate": 2.0107128609351817e-06, + "loss": 0.0749, + "step": 1042 + }, + { + "epoch": 5.6378378378378375, + "grad_norm": 2.9229767322540283, + "learning_rate": 2.006550275302965e-06, + "loss": 0.0713, + "step": 1043 + }, + { + "epoch": 5.643243243243243, + "grad_norm": 3.2177693843841553, + "learning_rate": 2.002389112651728e-06, + "loss": 0.0547, + "step": 1044 + }, + { + "epoch": 5.648648648648649, + "grad_norm": 2.5188214778900146, + "learning_rate": 1.9982293849811852e-06, + "loss": 0.0304, + "step": 1045 + }, + { + "epoch": 5.654054054054054, + "grad_norm": 2.8611507415771484, + "learning_rate": 1.994071104286911e-06, + "loss": 0.0227, + "step": 1046 + }, + { + "epoch": 5.65945945945946, + "grad_norm": 2.2558059692382812, + "learning_rate": 1.9899142825603078e-06, + "loss": 0.0811, + "step": 1047 + }, + { + "epoch": 5.664864864864865, + "grad_norm": 2.3414204120635986, + "learning_rate": 1.9857589317885727e-06, + "loss": 0.0292, + "step": 1048 + }, + { + "epoch": 5.6702702702702705, + "grad_norm": 2.4263527393341064, + "learning_rate": 1.9816050639546566e-06, + "loss": 0.0386, + "step": 1049 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 3.6473093032836914, + "learning_rate": 1.977452691037239e-06, + "loss": 0.1448, + "step": 1050 + }, + { + "epoch": 5.681081081081081, + "grad_norm": 2.8061227798461914, + "learning_rate": 1.973301825010685e-06, + "loss": 0.0451, + "step": 1051 + }, + { + "epoch": 5.686486486486486, + "grad_norm": 2.5342822074890137, + "learning_rate": 1.9691524778450145e-06, + "loss": 0.0708, + "step": 1052 + }, + { + "epoch": 5.691891891891892, + "grad_norm": 2.632966995239258, + "learning_rate": 1.96500466150587e-06, + "loss": 0.0311, + "step": 1053 + }, + { + "epoch": 5.697297297297297, + "grad_norm": 2.9255290031433105, + "learning_rate": 1.960858387954476e-06, + "loss": 0.0728, + "step": 1054 + }, + { + "epoch": 5.702702702702703, + "grad_norm": 3.292577028274536, + "learning_rate": 1.956713669147611e-06, + "loss": 0.1429, + "step": 1055 + }, + { + "epoch": 5.708108108108108, + "grad_norm": 2.7926251888275146, + "learning_rate": 1.9525705170375674e-06, + "loss": 0.0702, + "step": 1056 + }, + { + "epoch": 5.713513513513513, + "grad_norm": 2.432650089263916, + "learning_rate": 1.948428943572121e-06, + "loss": 0.0934, + "step": 1057 + }, + { + "epoch": 5.718918918918919, + "grad_norm": 2.3497097492218018, + "learning_rate": 1.944288960694497e-06, + "loss": 0.0327, + "step": 1058 + }, + { + "epoch": 5.724324324324324, + "grad_norm": 2.9069294929504395, + "learning_rate": 1.9401505803433308e-06, + "loss": 0.1025, + "step": 1059 + }, + { + "epoch": 5.72972972972973, + "grad_norm": 3.2904415130615234, + "learning_rate": 1.9360138144526363e-06, + "loss": 0.0825, + "step": 1060 + }, + { + "epoch": 5.735135135135135, + "grad_norm": 3.0035643577575684, + "learning_rate": 1.9318786749517754e-06, + "loss": 0.164, + "step": 1061 + }, + { + "epoch": 5.7405405405405405, + "grad_norm": 3.595271110534668, + "learning_rate": 1.9277451737654154e-06, + "loss": 0.0574, + "step": 1062 + }, + { + "epoch": 5.745945945945946, + "grad_norm": 3.5074777603149414, + "learning_rate": 1.923613322813503e-06, + "loss": 0.2916, + "step": 1063 + }, + { + "epoch": 5.751351351351351, + "grad_norm": 2.7535500526428223, + "learning_rate": 1.9194831340112228e-06, + "loss": 0.0626, + "step": 1064 + }, + { + "epoch": 5.756756756756757, + "grad_norm": 2.958237886428833, + "learning_rate": 1.915354619268969e-06, + "loss": 0.0544, + "step": 1065 + }, + { + "epoch": 5.762162162162162, + "grad_norm": 2.9726474285125732, + "learning_rate": 1.9112277904923064e-06, + "loss": 0.0145, + "step": 1066 + }, + { + "epoch": 5.767567567567568, + "grad_norm": 2.744746446609497, + "learning_rate": 1.9071026595819387e-06, + "loss": 0.0335, + "step": 1067 + }, + { + "epoch": 5.772972972972973, + "grad_norm": 3.1849920749664307, + "learning_rate": 1.902979238433673e-06, + "loss": 0.1385, + "step": 1068 + }, + { + "epoch": 5.778378378378378, + "grad_norm": 2.9969868659973145, + "learning_rate": 1.8988575389383853e-06, + "loss": 0.0523, + "step": 1069 + }, + { + "epoch": 5.783783783783784, + "grad_norm": 3.8293309211730957, + "learning_rate": 1.8947375729819894e-06, + "loss": 0.171, + "step": 1070 + }, + { + "epoch": 5.789189189189189, + "grad_norm": 2.845538854598999, + "learning_rate": 1.8906193524453964e-06, + "loss": 0.0431, + "step": 1071 + }, + { + "epoch": 5.794594594594595, + "grad_norm": 1.819235920906067, + "learning_rate": 1.886502889204487e-06, + "loss": 0.0157, + "step": 1072 + }, + { + "epoch": 5.8, + "grad_norm": 3.492358684539795, + "learning_rate": 1.882388195130073e-06, + "loss": 0.0892, + "step": 1073 + }, + { + "epoch": 5.805405405405406, + "grad_norm": 2.1627602577209473, + "learning_rate": 1.8782752820878636e-06, + "loss": 0.0376, + "step": 1074 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 3.6203341484069824, + "learning_rate": 1.8741641619384343e-06, + "loss": 0.1174, + "step": 1075 + }, + { + "epoch": 5.816216216216216, + "grad_norm": 2.3573997020721436, + "learning_rate": 1.8700548465371877e-06, + "loss": 0.0191, + "step": 1076 + }, + { + "epoch": 5.821621621621622, + "grad_norm": 3.5267531871795654, + "learning_rate": 1.8659473477343233e-06, + "loss": 0.1243, + "step": 1077 + }, + { + "epoch": 5.827027027027027, + "grad_norm": 3.5826189517974854, + "learning_rate": 1.8618416773748032e-06, + "loss": 0.1457, + "step": 1078 + }, + { + "epoch": 5.832432432432433, + "grad_norm": 2.7825980186462402, + "learning_rate": 1.8577378472983148e-06, + "loss": 0.0366, + "step": 1079 + }, + { + "epoch": 5.837837837837838, + "grad_norm": 2.7613232135772705, + "learning_rate": 1.8536358693392398e-06, + "loss": 0.065, + "step": 1080 + }, + { + "epoch": 5.8432432432432435, + "grad_norm": 3.1205132007598877, + "learning_rate": 1.8495357553266176e-06, + "loss": 0.1902, + "step": 1081 + }, + { + "epoch": 5.848648648648648, + "grad_norm": 2.7488930225372314, + "learning_rate": 1.8454375170841133e-06, + "loss": 0.0372, + "step": 1082 + }, + { + "epoch": 5.854054054054054, + "grad_norm": 3.496779441833496, + "learning_rate": 1.841341166429983e-06, + "loss": 0.0942, + "step": 1083 + }, + { + "epoch": 5.859459459459459, + "grad_norm": 3.724827527999878, + "learning_rate": 1.8372467151770391e-06, + "loss": 0.2317, + "step": 1084 + }, + { + "epoch": 5.864864864864865, + "grad_norm": 4.659550666809082, + "learning_rate": 1.8331541751326168e-06, + "loss": 0.1935, + "step": 1085 + }, + { + "epoch": 5.87027027027027, + "grad_norm": 4.368297100067139, + "learning_rate": 1.8290635580985395e-06, + "loss": 0.0905, + "step": 1086 + }, + { + "epoch": 5.875675675675676, + "grad_norm": 2.669170618057251, + "learning_rate": 1.8249748758710856e-06, + "loss": 0.0931, + "step": 1087 + }, + { + "epoch": 5.881081081081081, + "grad_norm": 2.9962668418884277, + "learning_rate": 1.8208881402409542e-06, + "loss": 0.0878, + "step": 1088 + }, + { + "epoch": 5.886486486486486, + "grad_norm": 4.08193302154541, + "learning_rate": 1.8168033629932296e-06, + "loss": 0.1317, + "step": 1089 + }, + { + "epoch": 5.891891891891892, + "grad_norm": 3.038261651992798, + "learning_rate": 1.8127205559073507e-06, + "loss": 0.027, + "step": 1090 + }, + { + "epoch": 5.897297297297297, + "grad_norm": 3.1188318729400635, + "learning_rate": 1.8086397307570724e-06, + "loss": 0.0872, + "step": 1091 + }, + { + "epoch": 5.902702702702703, + "grad_norm": 3.2329025268554688, + "learning_rate": 1.8045608993104373e-06, + "loss": 0.0821, + "step": 1092 + }, + { + "epoch": 5.908108108108108, + "grad_norm": 3.268589735031128, + "learning_rate": 1.8004840733297365e-06, + "loss": 0.0327, + "step": 1093 + }, + { + "epoch": 5.9135135135135135, + "grad_norm": 2.68831729888916, + "learning_rate": 1.7964092645714777e-06, + "loss": 0.0497, + "step": 1094 + }, + { + "epoch": 5.918918918918919, + "grad_norm": 2.5666730403900146, + "learning_rate": 1.7923364847863527e-06, + "loss": 0.0307, + "step": 1095 + }, + { + "epoch": 5.924324324324324, + "grad_norm": 4.285571098327637, + "learning_rate": 1.7882657457192015e-06, + "loss": 0.0897, + "step": 1096 + }, + { + "epoch": 5.92972972972973, + "grad_norm": 4.338192939758301, + "learning_rate": 1.784197059108979e-06, + "loss": 0.1545, + "step": 1097 + }, + { + "epoch": 5.935135135135135, + "grad_norm": 3.0083415508270264, + "learning_rate": 1.7801304366887235e-06, + "loss": 0.0509, + "step": 1098 + }, + { + "epoch": 5.940540540540541, + "grad_norm": 5.343819618225098, + "learning_rate": 1.776065890185517e-06, + "loss": 0.0821, + "step": 1099 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 2.2563998699188232, + "learning_rate": 1.7720034313204582e-06, + "loss": 0.0182, + "step": 1100 + }, + { + "epoch": 5.951351351351351, + "grad_norm": 3.2145767211914062, + "learning_rate": 1.7679430718086244e-06, + "loss": 0.1027, + "step": 1101 + }, + { + "epoch": 5.956756756756757, + "grad_norm": 3.159283399581909, + "learning_rate": 1.763884823359038e-06, + "loss": 0.0413, + "step": 1102 + }, + { + "epoch": 5.962162162162162, + "grad_norm": 3.57746958732605, + "learning_rate": 1.759828697674636e-06, + "loss": 0.1079, + "step": 1103 + }, + { + "epoch": 5.967567567567568, + "grad_norm": 2.7590816020965576, + "learning_rate": 1.7557747064522312e-06, + "loss": 0.0952, + "step": 1104 + }, + { + "epoch": 5.972972972972973, + "grad_norm": 4.943508148193359, + "learning_rate": 1.7517228613824836e-06, + "loss": 0.3393, + "step": 1105 + }, + { + "epoch": 5.978378378378379, + "grad_norm": 1.6088807582855225, + "learning_rate": 1.747673174149862e-06, + "loss": 0.0207, + "step": 1106 + }, + { + "epoch": 5.9837837837837835, + "grad_norm": 3.843369483947754, + "learning_rate": 1.743625656432615e-06, + "loss": 0.1708, + "step": 1107 + }, + { + "epoch": 5.989189189189189, + "grad_norm": 2.520202159881592, + "learning_rate": 1.7395803199027325e-06, + "loss": 0.0569, + "step": 1108 + }, + { + "epoch": 5.994594594594595, + "grad_norm": 4.245851993560791, + "learning_rate": 1.7355371762259155e-06, + "loss": 0.0861, + "step": 1109 + }, + { + "epoch": 6.0, + "grad_norm": 2.90023136138916, + "learning_rate": 1.7314962370615423e-06, + "loss": 0.0571, + "step": 1110 + } + ], + "logging_steps": 1, + "max_steps": 1850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.994751443912622e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a3ec16e2a61ab89bce56c0f273b7c43363722c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json @@ -0,0 +1,2624 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 370, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005405405405405406, + "grad_norm": 72.60939025878906, + "learning_rate": 5e-06, + "loss": 2.9165, + "step": 1 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 29.01830291748047, + "learning_rate": 4.999996395324314e-06, + "loss": 1.9314, + "step": 2 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 21.44908332824707, + "learning_rate": 4.99998558130765e-06, + "loss": 1.5709, + "step": 3 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 4.490907669067383, + "learning_rate": 4.999967557981192e-06, + "loss": 0.8099, + "step": 4 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 4.000796794891357, + "learning_rate": 4.999942325396917e-06, + "loss": 0.9021, + "step": 5 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 18.513282775878906, + "learning_rate": 4.999909883627588e-06, + "loss": 1.7972, + "step": 6 + }, + { + "epoch": 0.03783783783783784, + "grad_norm": 3.5735981464385986, + "learning_rate": 4.999870232766757e-06, + "loss": 1.4306, + "step": 7 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 3.1145193576812744, + "learning_rate": 4.9998233729287696e-06, + "loss": 1.051, + "step": 8 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 3.856376886367798, + "learning_rate": 4.999769304248755e-06, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 4.05589485168457, + "learning_rate": 4.9997080268826344e-06, + "loss": 1.0999, + "step": 10 + }, + { + "epoch": 0.05945945945945946, + "grad_norm": 13.784229278564453, + "learning_rate": 4.9996395410071165e-06, + "loss": 1.2831, + "step": 11 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 6.079237937927246, + "learning_rate": 4.999563846819696e-06, + "loss": 1.2874, + "step": 12 + }, + { + "epoch": 0.07027027027027027, + "grad_norm": 4.5971245765686035, + "learning_rate": 4.999480944538655e-06, + "loss": 0.96, + "step": 13 + }, + { + "epoch": 0.07567567567567568, + "grad_norm": 4.916017532348633, + "learning_rate": 4.999390834403063e-06, + "loss": 0.9869, + "step": 14 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 3.2311055660247803, + "learning_rate": 4.999293516672773e-06, + "loss": 0.9293, + "step": 15 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 3.3040921688079834, + "learning_rate": 4.9991889916284255e-06, + "loss": 0.8914, + "step": 16 + }, + { + "epoch": 0.0918918918918919, + "grad_norm": 3.794267416000366, + "learning_rate": 4.999077259571442e-06, + "loss": 1.0176, + "step": 17 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 4.788509845733643, + "learning_rate": 4.998958320824031e-06, + "loss": 1.0259, + "step": 18 + }, + { + "epoch": 0.10270270270270271, + "grad_norm": 10.027527809143066, + "learning_rate": 4.998832175729179e-06, + "loss": 1.3356, + "step": 19 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 4.612483978271484, + "learning_rate": 4.998698824650656e-06, + "loss": 1.4486, + "step": 20 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 3.8676936626434326, + "learning_rate": 4.998558267973014e-06, + "loss": 0.8372, + "step": 21 + }, + { + "epoch": 0.11891891891891893, + "grad_norm": 2.9611001014709473, + "learning_rate": 4.998410506101579e-06, + "loss": 0.7931, + "step": 22 + }, + { + "epoch": 0.12432432432432433, + "grad_norm": 5.508745193481445, + "learning_rate": 4.9982555394624595e-06, + "loss": 1.3022, + "step": 23 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 3.434845209121704, + "learning_rate": 4.998093368502539e-06, + "loss": 0.9739, + "step": 24 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 4.736802101135254, + "learning_rate": 4.9979239936894765e-06, + "loss": 1.1154, + "step": 25 + }, + { + "epoch": 0.14054054054054055, + "grad_norm": 3.69411039352417, + "learning_rate": 4.997747415511705e-06, + "loss": 0.7543, + "step": 26 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 2.8646645545959473, + "learning_rate": 4.997563634478428e-06, + "loss": 0.7278, + "step": 27 + }, + { + "epoch": 0.15135135135135136, + "grad_norm": 6.56904935836792, + "learning_rate": 4.997372651119626e-06, + "loss": 0.8167, + "step": 28 + }, + { + "epoch": 0.15675675675675677, + "grad_norm": 2.955914258956909, + "learning_rate": 4.997174465986044e-06, + "loss": 0.8031, + "step": 29 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 2.5714259147644043, + "learning_rate": 4.996969079649196e-06, + "loss": 0.689, + "step": 30 + }, + { + "epoch": 0.16756756756756758, + "grad_norm": 3.5165364742279053, + "learning_rate": 4.996756492701362e-06, + "loss": 0.8059, + "step": 31 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 3.2861921787261963, + "learning_rate": 4.996536705755591e-06, + "loss": 0.9658, + "step": 32 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 2.962470531463623, + "learning_rate": 4.996309719445687e-06, + "loss": 0.8349, + "step": 33 + }, + { + "epoch": 0.1837837837837838, + "grad_norm": 2.7694804668426514, + "learning_rate": 4.996075534426223e-06, + "loss": 0.8287, + "step": 34 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 3.405071258544922, + "learning_rate": 4.995834151372526e-06, + "loss": 1.1211, + "step": 35 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 2.8680710792541504, + "learning_rate": 4.995585570980685e-06, + "loss": 1.0841, + "step": 36 + }, + { + "epoch": 0.2, + "grad_norm": 3.341021776199341, + "learning_rate": 4.995329793967537e-06, + "loss": 0.6182, + "step": 37 + }, + { + "epoch": 0.20540540540540542, + "grad_norm": 3.0639379024505615, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.7647, + "step": 38 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 3.225759983062744, + "learning_rate": 4.994796653048457e-06, + "loss": 0.8691, + "step": 39 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 4.56926155090332, + "learning_rate": 4.994519290679965e-06, + "loss": 1.0404, + "step": 40 + }, + { + "epoch": 0.22162162162162163, + "grad_norm": 4.871571063995361, + "learning_rate": 4.994234734765043e-06, + "loss": 1.1877, + "step": 41 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 3.672215700149536, + "learning_rate": 4.993942986124278e-06, + "loss": 0.959, + "step": 42 + }, + { + "epoch": 0.23243243243243245, + "grad_norm": 3.184683322906494, + "learning_rate": 4.9936440455989975e-06, + "loss": 0.9249, + "step": 43 + }, + { + "epoch": 0.23783783783783785, + "grad_norm": 2.7092034816741943, + "learning_rate": 4.993337914051266e-06, + "loss": 0.6899, + "step": 44 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 3.153764486312866, + "learning_rate": 4.99302459236389e-06, + "loss": 0.9075, + "step": 45 + }, + { + "epoch": 0.24864864864864866, + "grad_norm": 3.3629748821258545, + "learning_rate": 4.992704081440407e-06, + "loss": 0.785, + "step": 46 + }, + { + "epoch": 0.25405405405405407, + "grad_norm": 4.478365898132324, + "learning_rate": 4.992376382205088e-06, + "loss": 1.008, + "step": 47 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 3.4001641273498535, + "learning_rate": 4.992041495602932e-06, + "loss": 0.7751, + "step": 48 + }, + { + "epoch": 0.2648648648648649, + "grad_norm": 2.522662878036499, + "learning_rate": 4.991699422599664e-06, + "loss": 0.9022, + "step": 49 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.764458179473877, + "learning_rate": 4.991350164181735e-06, + "loss": 0.8801, + "step": 50 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 2.814859628677368, + "learning_rate": 4.990993721356317e-06, + "loss": 0.7045, + "step": 51 + }, + { + "epoch": 0.2810810810810811, + "grad_norm": 2.441311836242676, + "learning_rate": 4.990630095151296e-06, + "loss": 0.7312, + "step": 52 + }, + { + "epoch": 0.2864864864864865, + "grad_norm": 2.4443013668060303, + "learning_rate": 4.9902592866152765e-06, + "loss": 0.9609, + "step": 53 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 2.2934701442718506, + "learning_rate": 4.989881296817575e-06, + "loss": 0.5753, + "step": 54 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 2.6286847591400146, + "learning_rate": 4.989496126848215e-06, + "loss": 0.5118, + "step": 55 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 3.6817069053649902, + "learning_rate": 4.989103777817928e-06, + "loss": 1.1261, + "step": 56 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 3.011197566986084, + "learning_rate": 4.988704250858145e-06, + "loss": 0.7823, + "step": 57 + }, + { + "epoch": 0.31351351351351353, + "grad_norm": 2.5490806102752686, + "learning_rate": 4.988297547121e-06, + "loss": 0.6019, + "step": 58 + }, + { + "epoch": 0.31891891891891894, + "grad_norm": 3.0803146362304688, + "learning_rate": 4.98788366777932e-06, + "loss": 0.825, + "step": 59 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 3.015730619430542, + "learning_rate": 4.987462614026625e-06, + "loss": 0.7667, + "step": 60 + }, + { + "epoch": 0.32972972972972975, + "grad_norm": 2.5371594429016113, + "learning_rate": 4.987034387077126e-06, + "loss": 0.8051, + "step": 61 + }, + { + "epoch": 0.33513513513513515, + "grad_norm": 2.6414010524749756, + "learning_rate": 4.986598988165718e-06, + "loss": 0.6895, + "step": 62 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 3.065131187438965, + "learning_rate": 4.9861564185479785e-06, + "loss": 0.9268, + "step": 63 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 2.5708694458007812, + "learning_rate": 4.985706679500163e-06, + "loss": 0.9854, + "step": 64 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 2.768915891647339, + "learning_rate": 4.9852497723192025e-06, + "loss": 0.8083, + "step": 65 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 2.567901849746704, + "learning_rate": 4.9847856983227e-06, + "loss": 0.9098, + "step": 66 + }, + { + "epoch": 0.3621621621621622, + "grad_norm": 2.5766549110412598, + "learning_rate": 4.984314458848923e-06, + "loss": 0.8881, + "step": 67 + }, + { + "epoch": 0.3675675675675676, + "grad_norm": 2.9778389930725098, + "learning_rate": 4.983836055256804e-06, + "loss": 0.9877, + "step": 68 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 2.7225165367126465, + "learning_rate": 4.983350488925935e-06, + "loss": 0.8282, + "step": 69 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 2.702287197113037, + "learning_rate": 4.982857761256564e-06, + "loss": 1.1756, + "step": 70 + }, + { + "epoch": 0.3837837837837838, + "grad_norm": 2.9815568923950195, + "learning_rate": 4.982357873669589e-06, + "loss": 0.8114, + "step": 71 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 3.27150297164917, + "learning_rate": 4.981850827606556e-06, + "loss": 0.6763, + "step": 72 + }, + { + "epoch": 0.3945945945945946, + "grad_norm": 2.568423271179199, + "learning_rate": 4.981336624529655e-06, + "loss": 0.9372, + "step": 73 + }, + { + "epoch": 0.4, + "grad_norm": 2.621175527572632, + "learning_rate": 4.980815265921714e-06, + "loss": 1.0155, + "step": 74 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 2.62827205657959, + "learning_rate": 4.980286753286196e-06, + "loss": 0.949, + "step": 75 + }, + { + "epoch": 0.41081081081081083, + "grad_norm": 2.9462146759033203, + "learning_rate": 4.979751088147192e-06, + "loss": 1.0134, + "step": 76 + }, + { + "epoch": 0.41621621621621624, + "grad_norm": 2.814852714538574, + "learning_rate": 4.979208272049425e-06, + "loss": 0.9722, + "step": 77 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 4.177679538726807, + "learning_rate": 4.978658306558235e-06, + "loss": 1.2259, + "step": 78 + }, + { + "epoch": 0.42702702702702705, + "grad_norm": 2.813084125518799, + "learning_rate": 4.978101193259578e-06, + "loss": 0.834, + "step": 79 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.71824049949646, + "learning_rate": 4.977536933760025e-06, + "loss": 0.6151, + "step": 80 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 4.992153167724609, + "learning_rate": 4.976965529686755e-06, + "loss": 1.0475, + "step": 81 + }, + { + "epoch": 0.44324324324324327, + "grad_norm": 2.4810822010040283, + "learning_rate": 4.976386982687548e-06, + "loss": 0.8324, + "step": 82 + }, + { + "epoch": 0.4486486486486487, + "grad_norm": 4.509149074554443, + "learning_rate": 4.9758012944307845e-06, + "loss": 0.997, + "step": 83 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 3.114325761795044, + "learning_rate": 4.975208466605436e-06, + "loss": 1.2024, + "step": 84 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 3.297091007232666, + "learning_rate": 4.974608500921064e-06, + "loss": 0.9146, + "step": 85 + }, + { + "epoch": 0.4648648648648649, + "grad_norm": 2.824475049972534, + "learning_rate": 4.974001399107816e-06, + "loss": 0.7181, + "step": 86 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 20.262290954589844, + "learning_rate": 4.973387162916415e-06, + "loss": 0.8599, + "step": 87 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 4.015744686126709, + "learning_rate": 4.972765794118158e-06, + "loss": 0.6081, + "step": 88 + }, + { + "epoch": 0.4810810810810811, + "grad_norm": 2.8033058643341064, + "learning_rate": 4.9721372945049114e-06, + "loss": 0.8764, + "step": 89 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 5.271846294403076, + "learning_rate": 4.971501665889107e-06, + "loss": 0.8622, + "step": 90 + }, + { + "epoch": 0.4918918918918919, + "grad_norm": 2.557264804840088, + "learning_rate": 4.9708589101037306e-06, + "loss": 0.5523, + "step": 91 + }, + { + "epoch": 0.4972972972972973, + "grad_norm": 4.342173099517822, + "learning_rate": 4.970209029002325e-06, + "loss": 0.8922, + "step": 92 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 2.950364351272583, + "learning_rate": 4.969552024458977e-06, + "loss": 0.9455, + "step": 93 + }, + { + "epoch": 0.5081081081081081, + "grad_norm": 2.6453042030334473, + "learning_rate": 4.968887898368318e-06, + "loss": 0.8342, + "step": 94 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 3.486766815185547, + "learning_rate": 4.968216652645515e-06, + "loss": 0.8476, + "step": 95 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 2.884152889251709, + "learning_rate": 4.967538289226268e-06, + "loss": 0.8879, + "step": 96 + }, + { + "epoch": 0.5243243243243243, + "grad_norm": 2.4130594730377197, + "learning_rate": 4.966852810066798e-06, + "loss": 0.7114, + "step": 97 + }, + { + "epoch": 0.5297297297297298, + "grad_norm": 3.182410955429077, + "learning_rate": 4.9661602171438524e-06, + "loss": 0.6757, + "step": 98 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 2.5027542114257812, + "learning_rate": 4.965460512454687e-06, + "loss": 0.8029, + "step": 99 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.3096024990081787, + "learning_rate": 4.964753698017071e-06, + "loss": 0.842, + "step": 100 + }, + { + "epoch": 0.5459459459459459, + "grad_norm": 2.875657081604004, + "learning_rate": 4.964039775869271e-06, + "loss": 0.6339, + "step": 101 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 2.505406141281128, + "learning_rate": 4.963318748070056e-06, + "loss": 0.7743, + "step": 102 + }, + { + "epoch": 0.5567567567567567, + "grad_norm": 3.552562713623047, + "learning_rate": 4.9625906166986815e-06, + "loss": 0.926, + "step": 103 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 2.717942476272583, + "learning_rate": 4.961855383854889e-06, + "loss": 0.7037, + "step": 104 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 2.5049386024475098, + "learning_rate": 4.961113051658901e-06, + "loss": 0.561, + "step": 105 + }, + { + "epoch": 0.572972972972973, + "grad_norm": 2.3112900257110596, + "learning_rate": 4.96036362225141e-06, + "loss": 0.7316, + "step": 106 + }, + { + "epoch": 0.5783783783783784, + "grad_norm": 2.470257520675659, + "learning_rate": 4.959607097793575e-06, + "loss": 0.6426, + "step": 107 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 3.8040788173675537, + "learning_rate": 4.9588434804670176e-06, + "loss": 1.0044, + "step": 108 + }, + { + "epoch": 0.5891891891891892, + "grad_norm": 3.143547296524048, + "learning_rate": 4.958072772473812e-06, + "loss": 0.9219, + "step": 109 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 3.5052590370178223, + "learning_rate": 4.9572949760364795e-06, + "loss": 0.6056, + "step": 110 + }, + { + "epoch": 0.6, + "grad_norm": 3.064009428024292, + "learning_rate": 4.9565100933979835e-06, + "loss": 0.6346, + "step": 111 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 2.694610595703125, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.9856, + "step": 112 + }, + { + "epoch": 0.6108108108108108, + "grad_norm": 2.5885775089263916, + "learning_rate": 4.954919078591521e-06, + "loss": 0.8669, + "step": 113 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 2.593609571456909, + "learning_rate": 4.954112951011628e-06, + "loss": 0.7201, + "step": 114 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 3.3045759201049805, + "learning_rate": 4.9532997464067065e-06, + "loss": 0.9095, + "step": 115 + }, + { + "epoch": 0.6270270270270271, + "grad_norm": 2.8144869804382324, + "learning_rate": 4.952479467121828e-06, + "loss": 1.0213, + "step": 116 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 2.5460312366485596, + "learning_rate": 4.951652115522463e-06, + "loss": 1.1154, + "step": 117 + }, + { + "epoch": 0.6378378378378379, + "grad_norm": 2.795137405395508, + "learning_rate": 4.950817693994481e-06, + "loss": 0.691, + "step": 118 + }, + { + "epoch": 0.6432432432432432, + "grad_norm": 2.4979195594787598, + "learning_rate": 4.949976204944135e-06, + "loss": 0.7224, + "step": 119 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 3.3131983280181885, + "learning_rate": 4.949127650798063e-06, + "loss": 0.9256, + "step": 120 + }, + { + "epoch": 0.654054054054054, + "grad_norm": 2.9060285091400146, + "learning_rate": 4.948272034003275e-06, + "loss": 0.6892, + "step": 121 + }, + { + "epoch": 0.6594594594594595, + "grad_norm": 3.695594549179077, + "learning_rate": 4.947409357027148e-06, + "loss": 0.5878, + "step": 122 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 3.1250460147857666, + "learning_rate": 4.9465396223574165e-06, + "loss": 0.9904, + "step": 123 + }, + { + "epoch": 0.6702702702702703, + "grad_norm": 4.024891376495361, + "learning_rate": 4.945662832502172e-06, + "loss": 1.1592, + "step": 124 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 2.6886494159698486, + "learning_rate": 4.944778989989847e-06, + "loss": 1.0041, + "step": 125 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 2.366912841796875, + "learning_rate": 4.943888097369216e-06, + "loss": 0.7045, + "step": 126 + }, + { + "epoch": 0.6864864864864865, + "grad_norm": 2.394932270050049, + "learning_rate": 4.942990157209381e-06, + "loss": 0.6685, + "step": 127 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 2.61933970451355, + "learning_rate": 4.9420851720997674e-06, + "loss": 0.8812, + "step": 128 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 2.7395646572113037, + "learning_rate": 4.94117314465012e-06, + "loss": 1.3014, + "step": 129 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 3.065484046936035, + "learning_rate": 4.940254077490487e-06, + "loss": 0.6978, + "step": 130 + }, + { + "epoch": 0.7081081081081081, + "grad_norm": 2.895038366317749, + "learning_rate": 4.939327973271222e-06, + "loss": 0.6249, + "step": 131 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 3.1773312091827393, + "learning_rate": 4.9383948346629665e-06, + "loss": 0.6423, + "step": 132 + }, + { + "epoch": 0.7189189189189189, + "grad_norm": 2.2378008365631104, + "learning_rate": 4.937454664356652e-06, + "loss": 0.7193, + "step": 133 + }, + { + "epoch": 0.7243243243243244, + "grad_norm": 2.5673701763153076, + "learning_rate": 4.9365074650634855e-06, + "loss": 0.7065, + "step": 134 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 2.7348387241363525, + "learning_rate": 4.9355532395149445e-06, + "loss": 1.0046, + "step": 135 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 2.391741991043091, + "learning_rate": 4.9345919904627655e-06, + "loss": 0.6771, + "step": 136 + }, + { + "epoch": 0.7405405405405405, + "grad_norm": 2.2096705436706543, + "learning_rate": 4.933623720678944e-06, + "loss": 0.6589, + "step": 137 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 3.0840072631835938, + "learning_rate": 4.932648432955718e-06, + "loss": 0.8755, + "step": 138 + }, + { + "epoch": 0.7513513513513513, + "grad_norm": 2.4970428943634033, + "learning_rate": 4.931666130105564e-06, + "loss": 0.6685, + "step": 139 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 4.315455436706543, + "learning_rate": 4.930676814961189e-06, + "loss": 0.8101, + "step": 140 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 5.388065814971924, + "learning_rate": 4.92968049037552e-06, + "loss": 0.8193, + "step": 141 + }, + { + "epoch": 0.7675675675675676, + "grad_norm": 2.6107139587402344, + "learning_rate": 4.9286771592217005e-06, + "loss": 0.7852, + "step": 142 + }, + { + "epoch": 0.772972972972973, + "grad_norm": 3.936556577682495, + "learning_rate": 4.927666824393076e-06, + "loss": 1.0388, + "step": 143 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 2.74424409866333, + "learning_rate": 4.926649488803191e-06, + "loss": 0.8266, + "step": 144 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 2.8998451232910156, + "learning_rate": 4.925625155385776e-06, + "loss": 0.4895, + "step": 145 + }, + { + "epoch": 0.7891891891891892, + "grad_norm": 3.0631520748138428, + "learning_rate": 4.924593827094743e-06, + "loss": 0.8759, + "step": 146 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 3.233267307281494, + "learning_rate": 4.923555506904176e-06, + "loss": 0.701, + "step": 147 + }, + { + "epoch": 0.8, + "grad_norm": 2.87701416015625, + "learning_rate": 4.922510197808321e-06, + "loss": 1.1327, + "step": 148 + }, + { + "epoch": 0.8054054054054054, + "grad_norm": 3.650576114654541, + "learning_rate": 4.921457902821578e-06, + "loss": 0.7587, + "step": 149 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 3.232112407684326, + "learning_rate": 4.920398624978493e-06, + "loss": 1.2158, + "step": 150 + }, + { + "epoch": 0.8162162162162162, + "grad_norm": 2.468384027481079, + "learning_rate": 4.919332367333748e-06, + "loss": 0.6852, + "step": 151 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 2.5947415828704834, + "learning_rate": 4.918259132962154e-06, + "loss": 0.6611, + "step": 152 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 3.0171427726745605, + "learning_rate": 4.917178924958638e-06, + "loss": 0.7327, + "step": 153 + }, + { + "epoch": 0.8324324324324325, + "grad_norm": 3.293184518814087, + "learning_rate": 4.916091746438243e-06, + "loss": 0.8528, + "step": 154 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 4.0570969581604, + "learning_rate": 4.9149976005361085e-06, + "loss": 0.9141, + "step": 155 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 2.8782784938812256, + "learning_rate": 4.913896490407467e-06, + "loss": 1.1132, + "step": 156 + }, + { + "epoch": 0.8486486486486486, + "grad_norm": 2.5671517848968506, + "learning_rate": 4.912788419227635e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.8540540540540541, + "grad_norm": 2.9445390701293945, + "learning_rate": 4.911673390192002e-06, + "loss": 0.9227, + "step": 158 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 2.472595453262329, + "learning_rate": 4.910551406516023e-06, + "loss": 0.8154, + "step": 159 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 2.5233397483825684, + "learning_rate": 4.909422471435207e-06, + "loss": 0.9897, + "step": 160 + }, + { + "epoch": 0.8702702702702703, + "grad_norm": 3.3919546604156494, + "learning_rate": 4.90828658820511e-06, + "loss": 0.6162, + "step": 161 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 3.060908555984497, + "learning_rate": 4.907143760101325e-06, + "loss": 0.5734, + "step": 162 + }, + { + "epoch": 0.8810810810810811, + "grad_norm": 3.4584782123565674, + "learning_rate": 4.905993990419472e-06, + "loss": 0.8328, + "step": 163 + }, + { + "epoch": 0.8864864864864865, + "grad_norm": 2.936570644378662, + "learning_rate": 4.904837282475187e-06, + "loss": 0.6787, + "step": 164 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 2.564837694168091, + "learning_rate": 4.9036736396041165e-06, + "loss": 0.9658, + "step": 165 + }, + { + "epoch": 0.8972972972972973, + "grad_norm": 3.2509360313415527, + "learning_rate": 4.902503065161905e-06, + "loss": 0.7899, + "step": 166 + }, + { + "epoch": 0.9027027027027027, + "grad_norm": 2.9730329513549805, + "learning_rate": 4.901325562524185e-06, + "loss": 0.9476, + "step": 167 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 3.044980049133301, + "learning_rate": 4.900141135086569e-06, + "loss": 0.7589, + "step": 168 + }, + { + "epoch": 0.9135135135135135, + "grad_norm": 3.030585527420044, + "learning_rate": 4.898949786264638e-06, + "loss": 0.6724, + "step": 169 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 2.249122142791748, + "learning_rate": 4.897751519493933e-06, + "loss": 0.6968, + "step": 170 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 2.9816982746124268, + "learning_rate": 4.896546338229945e-06, + "loss": 0.7984, + "step": 171 + }, + { + "epoch": 0.9297297297297298, + "grad_norm": 2.415736675262451, + "learning_rate": 4.8953342459481034e-06, + "loss": 0.6109, + "step": 172 + }, + { + "epoch": 0.9351351351351351, + "grad_norm": 2.740518808364868, + "learning_rate": 4.894115246143768e-06, + "loss": 0.8126, + "step": 173 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 2.7610201835632324, + "learning_rate": 4.892889342332218e-06, + "loss": 0.6862, + "step": 174 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 3.057025194168091, + "learning_rate": 4.891656538048642e-06, + "loss": 0.9895, + "step": 175 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 2.569751262664795, + "learning_rate": 4.890416836848128e-06, + "loss": 0.8481, + "step": 176 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 2.4443397521972656, + "learning_rate": 4.889170242305652e-06, + "loss": 0.6478, + "step": 177 + }, + { + "epoch": 0.9621621621621622, + "grad_norm": 2.5009846687316895, + "learning_rate": 4.887916758016069e-06, + "loss": 0.9714, + "step": 178 + }, + { + "epoch": 0.9675675675675676, + "grad_norm": 3.101975202560425, + "learning_rate": 4.886656387594104e-06, + "loss": 1.1264, + "step": 179 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 2.6144704818725586, + "learning_rate": 4.885389134674338e-06, + "loss": 0.7664, + "step": 180 + }, + { + "epoch": 0.9783783783783784, + "grad_norm": 2.5834381580352783, + "learning_rate": 4.884115002911197e-06, + "loss": 0.6131, + "step": 181 + }, + { + "epoch": 0.9837837837837838, + "grad_norm": 2.5378055572509766, + "learning_rate": 4.88283399597895e-06, + "loss": 0.8733, + "step": 182 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 2.4095377922058105, + "learning_rate": 4.881546117571686e-06, + "loss": 0.643, + "step": 183 + }, + { + "epoch": 0.9945945945945946, + "grad_norm": 2.9554507732391357, + "learning_rate": 4.8802513714033135e-06, + "loss": 0.7287, + "step": 184 + }, + { + "epoch": 1.0, + "grad_norm": 2.8279213905334473, + "learning_rate": 4.878949761207545e-06, + "loss": 0.9927, + "step": 185 + }, + { + "epoch": 1.0054054054054054, + "grad_norm": 2.9361412525177, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.66, + "step": 186 + }, + { + "epoch": 1.0108108108108107, + "grad_norm": 3.392244338989258, + "learning_rate": 4.876325963767623e-06, + "loss": 0.594, + "step": 187 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 2.6276044845581055, + "learning_rate": 4.875003784089822e-06, + "loss": 0.5825, + "step": 188 + }, + { + "epoch": 1.0216216216216216, + "grad_norm": 2.2875545024871826, + "learning_rate": 4.873674755517305e-06, + "loss": 0.6594, + "step": 189 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 2.8086795806884766, + "learning_rate": 4.872338881882645e-06, + "loss": 0.7536, + "step": 190 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 2.3685200214385986, + "learning_rate": 4.870996167038154e-06, + "loss": 0.4849, + "step": 191 + }, + { + "epoch": 1.037837837837838, + "grad_norm": 3.0264766216278076, + "learning_rate": 4.869646614855877e-06, + "loss": 0.3771, + "step": 192 + }, + { + "epoch": 1.0432432432432432, + "grad_norm": 4.335122108459473, + "learning_rate": 4.868290229227567e-06, + "loss": 0.8545, + "step": 193 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 3.442172050476074, + "learning_rate": 4.866927014064692e-06, + "loss": 0.3698, + "step": 194 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 3.326539993286133, + "learning_rate": 4.86555697329841e-06, + "loss": 0.8468, + "step": 195 + }, + { + "epoch": 1.0594594594594595, + "grad_norm": 3.0372447967529297, + "learning_rate": 4.864180110879562e-06, + "loss": 0.8232, + "step": 196 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 2.955343008041382, + "learning_rate": 4.862796430778663e-06, + "loss": 0.4097, + "step": 197 + }, + { + "epoch": 1.0702702702702702, + "grad_norm": 2.4095399379730225, + "learning_rate": 4.861405936985889e-06, + "loss": 0.6746, + "step": 198 + }, + { + "epoch": 1.0756756756756758, + "grad_norm": 2.763500452041626, + "learning_rate": 4.860008633511059e-06, + "loss": 0.6605, + "step": 199 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 2.6751155853271484, + "learning_rate": 4.8586045243836384e-06, + "loss": 0.471, + "step": 200 + }, + { + "epoch": 1.0864864864864865, + "grad_norm": 3.3507862091064453, + "learning_rate": 4.857193613652711e-06, + "loss": 0.7665, + "step": 201 + }, + { + "epoch": 1.0918918918918918, + "grad_norm": 3.3064827919006348, + "learning_rate": 4.8557759053869775e-06, + "loss": 0.6436, + "step": 202 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 2.571828603744507, + "learning_rate": 4.854351403674741e-06, + "loss": 0.4642, + "step": 203 + }, + { + "epoch": 1.1027027027027028, + "grad_norm": 2.883220911026001, + "learning_rate": 4.852920112623895e-06, + "loss": 0.5737, + "step": 204 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 3.026144027709961, + "learning_rate": 4.851482036361912e-06, + "loss": 0.7302, + "step": 205 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 2.6689612865448, + "learning_rate": 4.850037179035829e-06, + "loss": 0.5229, + "step": 206 + }, + { + "epoch": 1.118918918918919, + "grad_norm": 2.4019956588745117, + "learning_rate": 4.8485855448122425e-06, + "loss": 0.5529, + "step": 207 + }, + { + "epoch": 1.1243243243243244, + "grad_norm": 2.3546230792999268, + "learning_rate": 4.847127137877286e-06, + "loss": 0.3635, + "step": 208 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 2.999096393585205, + "learning_rate": 4.8456619624366285e-06, + "loss": 0.8149, + "step": 209 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 10.072900772094727, + "learning_rate": 4.844190022715456e-06, + "loss": 0.8333, + "step": 210 + }, + { + "epoch": 1.1405405405405404, + "grad_norm": 2.222123384475708, + "learning_rate": 4.84271132295846e-06, + "loss": 0.3717, + "step": 211 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 2.8751113414764404, + "learning_rate": 4.841225867429826e-06, + "loss": 0.5994, + "step": 212 + }, + { + "epoch": 1.1513513513513514, + "grad_norm": 2.9580111503601074, + "learning_rate": 4.839733660413224e-06, + "loss": 0.8382, + "step": 213 + }, + { + "epoch": 1.1567567567567567, + "grad_norm": 4.628892421722412, + "learning_rate": 4.838234706211792e-06, + "loss": 0.818, + "step": 214 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 2.5103509426116943, + "learning_rate": 4.836729009148124e-06, + "loss": 0.4267, + "step": 215 + }, + { + "epoch": 1.1675675675675676, + "grad_norm": 2.6093738079071045, + "learning_rate": 4.835216573564261e-06, + "loss": 0.3472, + "step": 216 + }, + { + "epoch": 1.172972972972973, + "grad_norm": 3.0792338848114014, + "learning_rate": 4.833697403821672e-06, + "loss": 0.6323, + "step": 217 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 2.845163345336914, + "learning_rate": 4.8321715043012516e-06, + "loss": 0.6831, + "step": 218 + }, + { + "epoch": 1.1837837837837837, + "grad_norm": 3.0433948040008545, + "learning_rate": 4.830638879403296e-06, + "loss": 0.3682, + "step": 219 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 2.6533594131469727, + "learning_rate": 4.8290995335475e-06, + "loss": 0.4154, + "step": 220 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 2.9271352291107178, + "learning_rate": 4.827553471172935e-06, + "loss": 0.3991, + "step": 221 + }, + { + "epoch": 1.2, + "grad_norm": 2.9243528842926025, + "learning_rate": 4.826000696738045e-06, + "loss": 0.4538, + "step": 222 + }, + { + "epoch": 1.2054054054054055, + "grad_norm": 2.537332534790039, + "learning_rate": 4.824441214720629e-06, + "loss": 0.7692, + "step": 223 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 3.9193246364593506, + "learning_rate": 4.8228750296178275e-06, + "loss": 0.6038, + "step": 224 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 2.6646728515625, + "learning_rate": 4.821302145946113e-06, + "loss": 0.4147, + "step": 225 + }, + { + "epoch": 1.2216216216216216, + "grad_norm": 2.6519482135772705, + "learning_rate": 4.819722568241274e-06, + "loss": 0.5398, + "step": 226 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 2.2018048763275146, + "learning_rate": 4.818136301058401e-06, + "loss": 0.3864, + "step": 227 + }, + { + "epoch": 1.2324324324324325, + "grad_norm": 2.5660712718963623, + "learning_rate": 4.816543348971879e-06, + "loss": 0.5712, + "step": 228 + }, + { + "epoch": 1.2378378378378379, + "grad_norm": 3.237663745880127, + "learning_rate": 4.814943716575368e-06, + "loss": 0.662, + "step": 229 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 2.5570430755615234, + "learning_rate": 4.813337408481793e-06, + "loss": 0.8661, + "step": 230 + }, + { + "epoch": 1.2486486486486488, + "grad_norm": 2.9231269359588623, + "learning_rate": 4.811724429323329e-06, + "loss": 0.9218, + "step": 231 + }, + { + "epoch": 1.2540540540540541, + "grad_norm": 3.637084722518921, + "learning_rate": 4.810104783751389e-06, + "loss": 0.5597, + "step": 232 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 3.0218842029571533, + "learning_rate": 4.8084784764366125e-06, + "loss": 0.4786, + "step": 233 + }, + { + "epoch": 1.2648648648648648, + "grad_norm": 2.770214080810547, + "learning_rate": 4.806845512068846e-06, + "loss": 0.5219, + "step": 234 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 3.093053102493286, + "learning_rate": 4.805205895357137e-06, + "loss": 0.643, + "step": 235 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 2.6373348236083984, + "learning_rate": 4.803559631029713e-06, + "loss": 0.5858, + "step": 236 + }, + { + "epoch": 1.281081081081081, + "grad_norm": 2.452030897140503, + "learning_rate": 4.801906723833973e-06, + "loss": 0.4185, + "step": 237 + }, + { + "epoch": 1.2864864864864864, + "grad_norm": 2.72564697265625, + "learning_rate": 4.8002471785364734e-06, + "loss": 0.4917, + "step": 238 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 3.0389158725738525, + "learning_rate": 4.798580999922913e-06, + "loss": 0.645, + "step": 239 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 3.7002289295196533, + "learning_rate": 4.796908192798117e-06, + "loss": 0.5378, + "step": 240 + }, + { + "epoch": 1.3027027027027027, + "grad_norm": 2.1876111030578613, + "learning_rate": 4.7952287619860276e-06, + "loss": 0.5197, + "step": 241 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 3.903337240219116, + "learning_rate": 4.793542712329689e-06, + "loss": 1.0226, + "step": 242 + }, + { + "epoch": 1.3135135135135134, + "grad_norm": 2.3623552322387695, + "learning_rate": 4.791850048691228e-06, + "loss": 0.5502, + "step": 243 + }, + { + "epoch": 1.318918918918919, + "grad_norm": 3.0669031143188477, + "learning_rate": 4.79015077595185e-06, + "loss": 0.6976, + "step": 244 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 3.1480472087860107, + "learning_rate": 4.788444899011816e-06, + "loss": 0.4795, + "step": 245 + }, + { + "epoch": 1.3297297297297297, + "grad_norm": 3.7051920890808105, + "learning_rate": 4.786732422790432e-06, + "loss": 0.6526, + "step": 246 + }, + { + "epoch": 1.3351351351351353, + "grad_norm": 3.4358389377593994, + "learning_rate": 4.785013352226036e-06, + "loss": 0.5551, + "step": 247 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 2.3789355754852295, + "learning_rate": 4.7832876922759805e-06, + "loss": 0.3151, + "step": 248 + }, + { + "epoch": 1.345945945945946, + "grad_norm": 2.4843716621398926, + "learning_rate": 4.781555447916622e-06, + "loss": 0.6713, + "step": 249 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 3.0176303386688232, + "learning_rate": 4.779816624143302e-06, + "loss": 0.437, + "step": 250 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 2.868350028991699, + "learning_rate": 4.77807122597034e-06, + "loss": 0.7632, + "step": 251 + }, + { + "epoch": 1.3621621621621622, + "grad_norm": 2.4629738330841064, + "learning_rate": 4.776319258431009e-06, + "loss": 0.4894, + "step": 252 + }, + { + "epoch": 1.3675675675675676, + "grad_norm": 2.798297882080078, + "learning_rate": 4.77456072657753e-06, + "loss": 0.4456, + "step": 253 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 3.2977547645568848, + "learning_rate": 4.772795635481053e-06, + "loss": 0.5381, + "step": 254 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 4.1061906814575195, + "learning_rate": 4.77102399023164e-06, + "loss": 1.0302, + "step": 255 + }, + { + "epoch": 1.3837837837837839, + "grad_norm": 3.943284511566162, + "learning_rate": 4.769245795938261e-06, + "loss": 0.4875, + "step": 256 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 2.6420533657073975, + "learning_rate": 4.767461057728763e-06, + "loss": 0.4923, + "step": 257 + }, + { + "epoch": 1.3945945945945946, + "grad_norm": 3.3152263164520264, + "learning_rate": 4.76566978074987e-06, + "loss": 0.6699, + "step": 258 + }, + { + "epoch": 1.4, + "grad_norm": 2.6928882598876953, + "learning_rate": 4.7638719701671586e-06, + "loss": 0.6117, + "step": 259 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 2.706597328186035, + "learning_rate": 4.762067631165049e-06, + "loss": 0.8534, + "step": 260 + }, + { + "epoch": 1.4108108108108108, + "grad_norm": 2.9912848472595215, + "learning_rate": 4.760256768946787e-06, + "loss": 0.5057, + "step": 261 + }, + { + "epoch": 1.4162162162162162, + "grad_norm": 2.7098443508148193, + "learning_rate": 4.758439388734429e-06, + "loss": 0.7286, + "step": 262 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 3.1288092136383057, + "learning_rate": 4.7566154957688276e-06, + "loss": 0.9827, + "step": 263 + }, + { + "epoch": 1.427027027027027, + "grad_norm": 3.0505919456481934, + "learning_rate": 4.754785095309617e-06, + "loss": 0.7042, + "step": 264 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 2.6800339221954346, + "learning_rate": 4.752948192635199e-06, + "loss": 0.5179, + "step": 265 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 2.2246861457824707, + "learning_rate": 4.751104793042722e-06, + "loss": 0.8527, + "step": 266 + }, + { + "epoch": 1.4432432432432432, + "grad_norm": 2.4242751598358154, + "learning_rate": 4.7492549018480725e-06, + "loss": 0.5627, + "step": 267 + }, + { + "epoch": 1.4486486486486487, + "grad_norm": 2.763244152069092, + "learning_rate": 4.747398524385858e-06, + "loss": 0.8981, + "step": 268 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 2.856595993041992, + "learning_rate": 4.745535666009389e-06, + "loss": 0.5455, + "step": 269 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 2.4168624877929688, + "learning_rate": 4.743666332090664e-06, + "loss": 0.4348, + "step": 270 + }, + { + "epoch": 1.464864864864865, + "grad_norm": 2.5408060550689697, + "learning_rate": 4.74179052802036e-06, + "loss": 0.5524, + "step": 271 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 2.6216673851013184, + "learning_rate": 4.739908259207807e-06, + "loss": 0.7469, + "step": 272 + }, + { + "epoch": 1.4756756756756757, + "grad_norm": 5.397300720214844, + "learning_rate": 4.738019531080981e-06, + "loss": 0.7216, + "step": 273 + }, + { + "epoch": 1.481081081081081, + "grad_norm": 3.3481080532073975, + "learning_rate": 4.7361243490864825e-06, + "loss": 0.7527, + "step": 274 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 2.7943873405456543, + "learning_rate": 4.734222718689527e-06, + "loss": 0.7437, + "step": 275 + }, + { + "epoch": 1.491891891891892, + "grad_norm": 2.206890344619751, + "learning_rate": 4.732314645373922e-06, + "loss": 0.5187, + "step": 276 + }, + { + "epoch": 1.4972972972972973, + "grad_norm": 2.76442813873291, + "learning_rate": 4.730400134642055e-06, + "loss": 0.7186, + "step": 277 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 3.4754087924957275, + "learning_rate": 4.728479192014879e-06, + "loss": 0.9655, + "step": 278 + }, + { + "epoch": 1.5081081081081082, + "grad_norm": 2.923779249191284, + "learning_rate": 4.726551823031895e-06, + "loss": 0.6251, + "step": 279 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 3.1142773628234863, + "learning_rate": 4.7246180332511335e-06, + "loss": 0.4805, + "step": 280 + }, + { + "epoch": 1.518918918918919, + "grad_norm": 2.3477070331573486, + "learning_rate": 4.722677828249142e-06, + "loss": 1.0939, + "step": 281 + }, + { + "epoch": 1.5243243243243243, + "grad_norm": 2.8418569564819336, + "learning_rate": 4.720731213620972e-06, + "loss": 0.9485, + "step": 282 + }, + { + "epoch": 1.5297297297297296, + "grad_norm": 2.462710380554199, + "learning_rate": 4.718778194980152e-06, + "loss": 0.5805, + "step": 283 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 3.2379209995269775, + "learning_rate": 4.7168187779586805e-06, + "loss": 0.77, + "step": 284 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 3.0701661109924316, + "learning_rate": 4.71485296820701e-06, + "loss": 0.5932, + "step": 285 + }, + { + "epoch": 1.545945945945946, + "grad_norm": 4.099547386169434, + "learning_rate": 4.7128807713940245e-06, + "loss": 0.6296, + "step": 286 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 2.5529167652130127, + "learning_rate": 4.710902193207028e-06, + "loss": 0.6201, + "step": 287 + }, + { + "epoch": 1.5567567567567568, + "grad_norm": 2.794926881790161, + "learning_rate": 4.708917239351727e-06, + "loss": 0.5682, + "step": 288 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 3.2522501945495605, + "learning_rate": 4.706925915552214e-06, + "loss": 0.8877, + "step": 289 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 2.811847448348999, + "learning_rate": 4.704928227550949e-06, + "loss": 0.6521, + "step": 290 + }, + { + "epoch": 1.572972972972973, + "grad_norm": 2.7060673236846924, + "learning_rate": 4.702924181108745e-06, + "loss": 0.4929, + "step": 291 + }, + { + "epoch": 1.5783783783783782, + "grad_norm": 2.5009031295776367, + "learning_rate": 4.700913782004755e-06, + "loss": 0.4515, + "step": 292 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 2.6722700595855713, + "learning_rate": 4.698897036036446e-06, + "loss": 0.5477, + "step": 293 + }, + { + "epoch": 1.5891891891891892, + "grad_norm": 3.3333957195281982, + "learning_rate": 4.696873949019591e-06, + "loss": 0.9589, + "step": 294 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 2.4862897396087646, + "learning_rate": 4.694844526788248e-06, + "loss": 0.4425, + "step": 295 + }, + { + "epoch": 1.6, + "grad_norm": 2.78708553314209, + "learning_rate": 4.692808775194745e-06, + "loss": 0.4899, + "step": 296 + }, + { + "epoch": 1.6054054054054054, + "grad_norm": 2.9121289253234863, + "learning_rate": 4.690766700109659e-06, + "loss": 0.4884, + "step": 297 + }, + { + "epoch": 1.6108108108108108, + "grad_norm": 4.692054271697998, + "learning_rate": 4.688718307421807e-06, + "loss": 0.8977, + "step": 298 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 3.1290926933288574, + "learning_rate": 4.686663603038222e-06, + "loss": 0.6833, + "step": 299 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 3.5091123580932617, + "learning_rate": 4.6846025928841365e-06, + "loss": 0.9141, + "step": 300 + }, + { + "epoch": 1.627027027027027, + "grad_norm": 2.5466184616088867, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.5121, + "step": 301 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 2.7833092212677, + "learning_rate": 4.68046167905631e-06, + "loss": 0.5399, + "step": 302 + }, + { + "epoch": 1.637837837837838, + "grad_norm": 3.05135440826416, + "learning_rate": 4.678381787323889e-06, + "loss": 0.7921, + "step": 303 + }, + { + "epoch": 1.6432432432432433, + "grad_norm": 2.2391726970672607, + "learning_rate": 4.676295613703577e-06, + "loss": 0.7178, + "step": 304 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 2.3654022216796875, + "learning_rate": 4.674203164211357e-06, + "loss": 0.7162, + "step": 305 + }, + { + "epoch": 1.654054054054054, + "grad_norm": 2.436009645462036, + "learning_rate": 4.67210444488131e-06, + "loss": 0.6539, + "step": 306 + }, + { + "epoch": 1.6594594594594594, + "grad_norm": 2.6034209728240967, + "learning_rate": 4.669999461765599e-06, + "loss": 0.7214, + "step": 307 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 2.804229497909546, + "learning_rate": 4.6678882209344474e-06, + "loss": 0.7451, + "step": 308 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 2.6239655017852783, + "learning_rate": 4.665770728476127e-06, + "loss": 0.6464, + "step": 309 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 2.9320099353790283, + "learning_rate": 4.663646990496939e-06, + "loss": 0.6669, + "step": 310 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 3.09713077545166, + "learning_rate": 4.661517013121189e-06, + "loss": 0.8972, + "step": 311 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 3.6576132774353027, + "learning_rate": 4.659380802491181e-06, + "loss": 0.6286, + "step": 312 + }, + { + "epoch": 1.691891891891892, + "grad_norm": 2.9320433139801025, + "learning_rate": 4.6572383647671915e-06, + "loss": 0.3631, + "step": 313 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 3.399357557296753, + "learning_rate": 4.655089706127457e-06, + "loss": 0.5682, + "step": 314 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 2.7667412757873535, + "learning_rate": 4.652934832768148e-06, + "loss": 0.5457, + "step": 315 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 2.3023321628570557, + "learning_rate": 4.650773750903363e-06, + "loss": 0.6601, + "step": 316 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 2.6584670543670654, + "learning_rate": 4.6486064667651005e-06, + "loss": 0.5882, + "step": 317 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 5.528168678283691, + "learning_rate": 4.646432986603245e-06, + "loss": 0.7628, + "step": 318 + }, + { + "epoch": 1.7243243243243245, + "grad_norm": 3.054884195327759, + "learning_rate": 4.644253316685552e-06, + "loss": 0.6877, + "step": 319 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 3.2672388553619385, + "learning_rate": 4.6420674632976205e-06, + "loss": 0.7026, + "step": 320 + }, + { + "epoch": 1.7351351351351352, + "grad_norm": 3.109384536743164, + "learning_rate": 4.639875432742886e-06, + "loss": 0.5236, + "step": 321 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 3.3593883514404297, + "learning_rate": 4.6376772313425975e-06, + "loss": 0.6463, + "step": 322 + }, + { + "epoch": 1.7459459459459459, + "grad_norm": 2.6352698802948, + "learning_rate": 4.635472865435795e-06, + "loss": 0.6903, + "step": 323 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 2.751690149307251, + "learning_rate": 4.6332623413792995e-06, + "loss": 0.7342, + "step": 324 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 2.670915126800537, + "learning_rate": 4.6310456655476874e-06, + "loss": 0.4302, + "step": 325 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 2.7648138999938965, + "learning_rate": 4.6288228443332786e-06, + "loss": 0.5108, + "step": 326 + }, + { + "epoch": 1.7675675675675677, + "grad_norm": 2.7451536655426025, + "learning_rate": 4.626593884146111e-06, + "loss": 0.7646, + "step": 327 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 2.4656403064727783, + "learning_rate": 4.624358791413928e-06, + "loss": 0.5529, + "step": 328 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 2.5987517833709717, + "learning_rate": 4.622117572582159e-06, + "loss": 0.609, + "step": 329 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 3.3843371868133545, + "learning_rate": 4.619870234113894e-06, + "loss": 0.9146, + "step": 330 + }, + { + "epoch": 1.7891891891891891, + "grad_norm": 2.3542068004608154, + "learning_rate": 4.617616782489878e-06, + "loss": 0.6887, + "step": 331 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 2.2049715518951416, + "learning_rate": 4.615357224208477e-06, + "loss": 0.505, + "step": 332 + }, + { + "epoch": 1.8, + "grad_norm": 2.453920364379883, + "learning_rate": 4.613091565785674e-06, + "loss": 0.8384, + "step": 333 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 2.5751583576202393, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5512, + "step": 334 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 2.524075984954834, + "learning_rate": 4.608541974667714e-06, + "loss": 0.4877, + "step": 335 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 2.2856955528259277, + "learning_rate": 4.606258055092397e-06, + "loss": 0.5583, + "step": 336 + }, + { + "epoch": 1.8216216216216217, + "grad_norm": 2.2773683071136475, + "learning_rate": 4.603968061615321e-06, + "loss": 0.5421, + "step": 337 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 4.085512161254883, + "learning_rate": 4.601672000840231e-06, + "loss": 0.942, + "step": 338 + }, + { + "epoch": 1.8324324324324324, + "grad_norm": 2.3710968494415283, + "learning_rate": 4.5993698793883715e-06, + "loss": 0.3773, + "step": 339 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 2.745534658432007, + "learning_rate": 4.597061703898462e-06, + "loss": 0.9694, + "step": 340 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 2.463207244873047, + "learning_rate": 4.594747481026685e-06, + "loss": 0.4667, + "step": 341 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 2.7216601371765137, + "learning_rate": 4.592427217446656e-06, + "loss": 0.4267, + "step": 342 + }, + { + "epoch": 1.8540540540540542, + "grad_norm": 2.545664072036743, + "learning_rate": 4.590100919849413e-06, + "loss": 0.9245, + "step": 343 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 3.692840337753296, + "learning_rate": 4.587768594943396e-06, + "loss": 0.7502, + "step": 344 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 2.993229627609253, + "learning_rate": 4.585430249454426e-06, + "loss": 0.4689, + "step": 345 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 2.162867546081543, + "learning_rate": 4.583085890125682e-06, + "loss": 0.6188, + "step": 346 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 2.2169792652130127, + "learning_rate": 4.5807355237176896e-06, + "loss": 0.6352, + "step": 347 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 3.978985548019409, + "learning_rate": 4.578379157008296e-06, + "loss": 0.464, + "step": 348 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 2.236682653427124, + "learning_rate": 4.57601679679265e-06, + "loss": 0.5943, + "step": 349 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 2.528754472732544, + "learning_rate": 4.573648449883188e-06, + "loss": 0.6949, + "step": 350 + }, + { + "epoch": 1.8972972972972975, + "grad_norm": 2.7673721313476562, + "learning_rate": 4.571274123109606e-06, + "loss": 0.4333, + "step": 351 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 2.698012351989746, + "learning_rate": 4.568893823318847e-06, + "loss": 0.6796, + "step": 352 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 2.9640560150146484, + "learning_rate": 4.566507557375077e-06, + "loss": 0.6139, + "step": 353 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 2.417628526687622, + "learning_rate": 4.5641153321596684e-06, + "loss": 0.4515, + "step": 354 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 2.676739454269409, + "learning_rate": 4.56171715457118e-06, + "loss": 0.8426, + "step": 355 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 2.8428189754486084, + "learning_rate": 4.559313031525331e-06, + "loss": 0.5806, + "step": 356 + }, + { + "epoch": 1.9297297297297298, + "grad_norm": 2.6817944049835205, + "learning_rate": 4.55690296995499e-06, + "loss": 0.5927, + "step": 357 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 3.5939931869506836, + "learning_rate": 4.554486976810149e-06, + "loss": 0.9986, + "step": 358 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 2.86688494682312, + "learning_rate": 4.552065059057906e-06, + "loss": 0.6813, + "step": 359 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 2.9295246601104736, + "learning_rate": 4.549637223682441e-06, + "loss": 1.0832, + "step": 360 + }, + { + "epoch": 1.9513513513513514, + "grad_norm": 2.6939451694488525, + "learning_rate": 4.547203477685005e-06, + "loss": 0.7377, + "step": 361 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 2.226055145263672, + "learning_rate": 4.544763828083888e-06, + "loss": 0.5412, + "step": 362 + }, + { + "epoch": 1.962162162162162, + "grad_norm": 2.490187406539917, + "learning_rate": 4.542318281914405e-06, + "loss": 0.6955, + "step": 363 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 2.9241302013397217, + "learning_rate": 4.53986684622888e-06, + "loss": 0.6774, + "step": 364 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 2.988084554672241, + "learning_rate": 4.537409528096615e-06, + "loss": 0.5832, + "step": 365 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 2.9380626678466797, + "learning_rate": 4.534946334603879e-06, + "loss": 0.606, + "step": 366 + }, + { + "epoch": 1.983783783783784, + "grad_norm": 2.667588710784912, + "learning_rate": 4.532477272853882e-06, + "loss": 0.4991, + "step": 367 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 2.9711899757385254, + "learning_rate": 4.530002349966759e-06, + "loss": 0.4442, + "step": 368 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 3.443957805633545, + "learning_rate": 4.5275215730795445e-06, + "loss": 0.6566, + "step": 369 + }, + { + "epoch": 2.0, + "grad_norm": 3.590317487716675, + "learning_rate": 4.525034949346156e-06, + "loss": 0.5687, + "step": 370 + } + ], + "logging_steps": 1, + "max_steps": 1850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.981871016797798e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..30d31d54f352f0c71ad48745af612a088822fa48 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 2007565312, + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b4218da5adb2605ce95b0220be4741081fd64ed --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json @@ -0,0 +1,5214 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005405405405405406, + "grad_norm": 72.60939025878906, + "learning_rate": 5e-06, + "loss": 2.9165, + "step": 1 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 29.01830291748047, + "learning_rate": 4.999996395324314e-06, + "loss": 1.9314, + "step": 2 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 21.44908332824707, + "learning_rate": 4.99998558130765e-06, + "loss": 1.5709, + "step": 3 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 4.490907669067383, + "learning_rate": 4.999967557981192e-06, + "loss": 0.8099, + "step": 4 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 4.000796794891357, + "learning_rate": 4.999942325396917e-06, + "loss": 0.9021, + "step": 5 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 18.513282775878906, + "learning_rate": 4.999909883627588e-06, + "loss": 1.7972, + "step": 6 + }, + { + "epoch": 0.03783783783783784, + "grad_norm": 3.5735981464385986, + "learning_rate": 4.999870232766757e-06, + "loss": 1.4306, + "step": 7 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 3.1145193576812744, + "learning_rate": 4.9998233729287696e-06, + "loss": 1.051, + "step": 8 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 3.856376886367798, + "learning_rate": 4.999769304248755e-06, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 4.05589485168457, + "learning_rate": 4.9997080268826344e-06, + "loss": 1.0999, + "step": 10 + }, + { + "epoch": 0.05945945945945946, + "grad_norm": 13.784229278564453, + "learning_rate": 4.9996395410071165e-06, + "loss": 1.2831, + "step": 11 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 6.079237937927246, + "learning_rate": 4.999563846819696e-06, + "loss": 1.2874, + "step": 12 + }, + { + "epoch": 0.07027027027027027, + "grad_norm": 4.5971245765686035, + "learning_rate": 4.999480944538655e-06, + "loss": 0.96, + "step": 13 + }, + { + "epoch": 0.07567567567567568, + "grad_norm": 4.916017532348633, + "learning_rate": 4.999390834403063e-06, + "loss": 0.9869, + "step": 14 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 3.2311055660247803, + "learning_rate": 4.999293516672773e-06, + "loss": 0.9293, + "step": 15 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 3.3040921688079834, + "learning_rate": 4.9991889916284255e-06, + "loss": 0.8914, + "step": 16 + }, + { + "epoch": 0.0918918918918919, + "grad_norm": 3.794267416000366, + "learning_rate": 4.999077259571442e-06, + "loss": 1.0176, + "step": 17 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 4.788509845733643, + "learning_rate": 4.998958320824031e-06, + "loss": 1.0259, + "step": 18 + }, + { + "epoch": 0.10270270270270271, + "grad_norm": 10.027527809143066, + "learning_rate": 4.998832175729179e-06, + "loss": 1.3356, + "step": 19 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 4.612483978271484, + "learning_rate": 4.998698824650656e-06, + "loss": 1.4486, + "step": 20 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 3.8676936626434326, + "learning_rate": 4.998558267973014e-06, + "loss": 0.8372, + "step": 21 + }, + { + "epoch": 0.11891891891891893, + "grad_norm": 2.9611001014709473, + "learning_rate": 4.998410506101579e-06, + "loss": 0.7931, + "step": 22 + }, + { + "epoch": 0.12432432432432433, + "grad_norm": 5.508745193481445, + "learning_rate": 4.9982555394624595e-06, + "loss": 1.3022, + "step": 23 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 3.434845209121704, + "learning_rate": 4.998093368502539e-06, + "loss": 0.9739, + "step": 24 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 4.736802101135254, + "learning_rate": 4.9979239936894765e-06, + "loss": 1.1154, + "step": 25 + }, + { + "epoch": 0.14054054054054055, + "grad_norm": 3.69411039352417, + "learning_rate": 4.997747415511705e-06, + "loss": 0.7543, + "step": 26 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 2.8646645545959473, + "learning_rate": 4.997563634478428e-06, + "loss": 0.7278, + "step": 27 + }, + { + "epoch": 0.15135135135135136, + "grad_norm": 6.56904935836792, + "learning_rate": 4.997372651119626e-06, + "loss": 0.8167, + "step": 28 + }, + { + "epoch": 0.15675675675675677, + "grad_norm": 2.955914258956909, + "learning_rate": 4.997174465986044e-06, + "loss": 0.8031, + "step": 29 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 2.5714259147644043, + "learning_rate": 4.996969079649196e-06, + "loss": 0.689, + "step": 30 + }, + { + "epoch": 0.16756756756756758, + "grad_norm": 3.5165364742279053, + "learning_rate": 4.996756492701362e-06, + "loss": 0.8059, + "step": 31 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 3.2861921787261963, + "learning_rate": 4.996536705755591e-06, + "loss": 0.9658, + "step": 32 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 2.962470531463623, + "learning_rate": 4.996309719445687e-06, + "loss": 0.8349, + "step": 33 + }, + { + "epoch": 0.1837837837837838, + "grad_norm": 2.7694804668426514, + "learning_rate": 4.996075534426223e-06, + "loss": 0.8287, + "step": 34 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 3.405071258544922, + "learning_rate": 4.995834151372526e-06, + "loss": 1.1211, + "step": 35 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 2.8680710792541504, + "learning_rate": 4.995585570980685e-06, + "loss": 1.0841, + "step": 36 + }, + { + "epoch": 0.2, + "grad_norm": 3.341021776199341, + "learning_rate": 4.995329793967537e-06, + "loss": 0.6182, + "step": 37 + }, + { + "epoch": 0.20540540540540542, + "grad_norm": 3.0639379024505615, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.7647, + "step": 38 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 3.225759983062744, + "learning_rate": 4.994796653048457e-06, + "loss": 0.8691, + "step": 39 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 4.56926155090332, + "learning_rate": 4.994519290679965e-06, + "loss": 1.0404, + "step": 40 + }, + { + "epoch": 0.22162162162162163, + "grad_norm": 4.871571063995361, + "learning_rate": 4.994234734765043e-06, + "loss": 1.1877, + "step": 41 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 3.672215700149536, + "learning_rate": 4.993942986124278e-06, + "loss": 0.959, + "step": 42 + }, + { + "epoch": 0.23243243243243245, + "grad_norm": 3.184683322906494, + "learning_rate": 4.9936440455989975e-06, + "loss": 0.9249, + "step": 43 + }, + { + "epoch": 0.23783783783783785, + "grad_norm": 2.7092034816741943, + "learning_rate": 4.993337914051266e-06, + "loss": 0.6899, + "step": 44 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 3.153764486312866, + "learning_rate": 4.99302459236389e-06, + "loss": 0.9075, + "step": 45 + }, + { + "epoch": 0.24864864864864866, + "grad_norm": 3.3629748821258545, + "learning_rate": 4.992704081440407e-06, + "loss": 0.785, + "step": 46 + }, + { + "epoch": 0.25405405405405407, + "grad_norm": 4.478365898132324, + "learning_rate": 4.992376382205088e-06, + "loss": 1.008, + "step": 47 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 3.4001641273498535, + "learning_rate": 4.992041495602932e-06, + "loss": 0.7751, + "step": 48 + }, + { + "epoch": 0.2648648648648649, + "grad_norm": 2.522662878036499, + "learning_rate": 4.991699422599664e-06, + "loss": 0.9022, + "step": 49 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.764458179473877, + "learning_rate": 4.991350164181735e-06, + "loss": 0.8801, + "step": 50 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 2.814859628677368, + "learning_rate": 4.990993721356317e-06, + "loss": 0.7045, + "step": 51 + }, + { + "epoch": 0.2810810810810811, + "grad_norm": 2.441311836242676, + "learning_rate": 4.990630095151296e-06, + "loss": 0.7312, + "step": 52 + }, + { + "epoch": 0.2864864864864865, + "grad_norm": 2.4443013668060303, + "learning_rate": 4.9902592866152765e-06, + "loss": 0.9609, + "step": 53 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 2.2934701442718506, + "learning_rate": 4.989881296817575e-06, + "loss": 0.5753, + "step": 54 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 2.6286847591400146, + "learning_rate": 4.989496126848215e-06, + "loss": 0.5118, + "step": 55 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 3.6817069053649902, + "learning_rate": 4.989103777817928e-06, + "loss": 1.1261, + "step": 56 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 3.011197566986084, + "learning_rate": 4.988704250858145e-06, + "loss": 0.7823, + "step": 57 + }, + { + "epoch": 0.31351351351351353, + "grad_norm": 2.5490806102752686, + "learning_rate": 4.988297547121e-06, + "loss": 0.6019, + "step": 58 + }, + { + "epoch": 0.31891891891891894, + "grad_norm": 3.0803146362304688, + "learning_rate": 4.98788366777932e-06, + "loss": 0.825, + "step": 59 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 3.015730619430542, + "learning_rate": 4.987462614026625e-06, + "loss": 0.7667, + "step": 60 + }, + { + "epoch": 0.32972972972972975, + "grad_norm": 2.5371594429016113, + "learning_rate": 4.987034387077126e-06, + "loss": 0.8051, + "step": 61 + }, + { + "epoch": 0.33513513513513515, + "grad_norm": 2.6414010524749756, + "learning_rate": 4.986598988165718e-06, + "loss": 0.6895, + "step": 62 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 3.065131187438965, + "learning_rate": 4.9861564185479785e-06, + "loss": 0.9268, + "step": 63 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 2.5708694458007812, + "learning_rate": 4.985706679500163e-06, + "loss": 0.9854, + "step": 64 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 2.768915891647339, + "learning_rate": 4.9852497723192025e-06, + "loss": 0.8083, + "step": 65 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 2.567901849746704, + "learning_rate": 4.9847856983227e-06, + "loss": 0.9098, + "step": 66 + }, + { + "epoch": 0.3621621621621622, + "grad_norm": 2.5766549110412598, + "learning_rate": 4.984314458848923e-06, + "loss": 0.8881, + "step": 67 + }, + { + "epoch": 0.3675675675675676, + "grad_norm": 2.9778389930725098, + "learning_rate": 4.983836055256804e-06, + "loss": 0.9877, + "step": 68 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 2.7225165367126465, + "learning_rate": 4.983350488925935e-06, + "loss": 0.8282, + "step": 69 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 2.702287197113037, + "learning_rate": 4.982857761256564e-06, + "loss": 1.1756, + "step": 70 + }, + { + "epoch": 0.3837837837837838, + "grad_norm": 2.9815568923950195, + "learning_rate": 4.982357873669589e-06, + "loss": 0.8114, + "step": 71 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 3.27150297164917, + "learning_rate": 4.981850827606556e-06, + "loss": 0.6763, + "step": 72 + }, + { + "epoch": 0.3945945945945946, + "grad_norm": 2.568423271179199, + "learning_rate": 4.981336624529655e-06, + "loss": 0.9372, + "step": 73 + }, + { + "epoch": 0.4, + "grad_norm": 2.621175527572632, + "learning_rate": 4.980815265921714e-06, + "loss": 1.0155, + "step": 74 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 2.62827205657959, + "learning_rate": 4.980286753286196e-06, + "loss": 0.949, + "step": 75 + }, + { + "epoch": 0.41081081081081083, + "grad_norm": 2.9462146759033203, + "learning_rate": 4.979751088147192e-06, + "loss": 1.0134, + "step": 76 + }, + { + "epoch": 0.41621621621621624, + "grad_norm": 2.814852714538574, + "learning_rate": 4.979208272049425e-06, + "loss": 0.9722, + "step": 77 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 4.177679538726807, + "learning_rate": 4.978658306558235e-06, + "loss": 1.2259, + "step": 78 + }, + { + "epoch": 0.42702702702702705, + "grad_norm": 2.813084125518799, + "learning_rate": 4.978101193259578e-06, + "loss": 0.834, + "step": 79 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.71824049949646, + "learning_rate": 4.977536933760025e-06, + "loss": 0.6151, + "step": 80 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 4.992153167724609, + "learning_rate": 4.976965529686755e-06, + "loss": 1.0475, + "step": 81 + }, + { + "epoch": 0.44324324324324327, + "grad_norm": 2.4810822010040283, + "learning_rate": 4.976386982687548e-06, + "loss": 0.8324, + "step": 82 + }, + { + "epoch": 0.4486486486486487, + "grad_norm": 4.509149074554443, + "learning_rate": 4.9758012944307845e-06, + "loss": 0.997, + "step": 83 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 3.114325761795044, + "learning_rate": 4.975208466605436e-06, + "loss": 1.2024, + "step": 84 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 3.297091007232666, + "learning_rate": 4.974608500921064e-06, + "loss": 0.9146, + "step": 85 + }, + { + "epoch": 0.4648648648648649, + "grad_norm": 2.824475049972534, + "learning_rate": 4.974001399107816e-06, + "loss": 0.7181, + "step": 86 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 20.262290954589844, + "learning_rate": 4.973387162916415e-06, + "loss": 0.8599, + "step": 87 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 4.015744686126709, + "learning_rate": 4.972765794118158e-06, + "loss": 0.6081, + "step": 88 + }, + { + "epoch": 0.4810810810810811, + "grad_norm": 2.8033058643341064, + "learning_rate": 4.9721372945049114e-06, + "loss": 0.8764, + "step": 89 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 5.271846294403076, + "learning_rate": 4.971501665889107e-06, + "loss": 0.8622, + "step": 90 + }, + { + "epoch": 0.4918918918918919, + "grad_norm": 2.557264804840088, + "learning_rate": 4.9708589101037306e-06, + "loss": 0.5523, + "step": 91 + }, + { + "epoch": 0.4972972972972973, + "grad_norm": 4.342173099517822, + "learning_rate": 4.970209029002325e-06, + "loss": 0.8922, + "step": 92 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 2.950364351272583, + "learning_rate": 4.969552024458977e-06, + "loss": 0.9455, + "step": 93 + }, + { + "epoch": 0.5081081081081081, + "grad_norm": 2.6453042030334473, + "learning_rate": 4.968887898368318e-06, + "loss": 0.8342, + "step": 94 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 3.486766815185547, + "learning_rate": 4.968216652645515e-06, + "loss": 0.8476, + "step": 95 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 2.884152889251709, + "learning_rate": 4.967538289226268e-06, + "loss": 0.8879, + "step": 96 + }, + { + "epoch": 0.5243243243243243, + "grad_norm": 2.4130594730377197, + "learning_rate": 4.966852810066798e-06, + "loss": 0.7114, + "step": 97 + }, + { + "epoch": 0.5297297297297298, + "grad_norm": 3.182410955429077, + "learning_rate": 4.9661602171438524e-06, + "loss": 0.6757, + "step": 98 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 2.5027542114257812, + "learning_rate": 4.965460512454687e-06, + "loss": 0.8029, + "step": 99 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.3096024990081787, + "learning_rate": 4.964753698017071e-06, + "loss": 0.842, + "step": 100 + }, + { + "epoch": 0.5459459459459459, + "grad_norm": 2.875657081604004, + "learning_rate": 4.964039775869271e-06, + "loss": 0.6339, + "step": 101 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 2.505406141281128, + "learning_rate": 4.963318748070056e-06, + "loss": 0.7743, + "step": 102 + }, + { + "epoch": 0.5567567567567567, + "grad_norm": 3.552562713623047, + "learning_rate": 4.9625906166986815e-06, + "loss": 0.926, + "step": 103 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 2.717942476272583, + "learning_rate": 4.961855383854889e-06, + "loss": 0.7037, + "step": 104 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 2.5049386024475098, + "learning_rate": 4.961113051658901e-06, + "loss": 0.561, + "step": 105 + }, + { + "epoch": 0.572972972972973, + "grad_norm": 2.3112900257110596, + "learning_rate": 4.96036362225141e-06, + "loss": 0.7316, + "step": 106 + }, + { + "epoch": 0.5783783783783784, + "grad_norm": 2.470257520675659, + "learning_rate": 4.959607097793575e-06, + "loss": 0.6426, + "step": 107 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 3.8040788173675537, + "learning_rate": 4.9588434804670176e-06, + "loss": 1.0044, + "step": 108 + }, + { + "epoch": 0.5891891891891892, + "grad_norm": 3.143547296524048, + "learning_rate": 4.958072772473812e-06, + "loss": 0.9219, + "step": 109 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 3.5052590370178223, + "learning_rate": 4.9572949760364795e-06, + "loss": 0.6056, + "step": 110 + }, + { + "epoch": 0.6, + "grad_norm": 3.064009428024292, + "learning_rate": 4.9565100933979835e-06, + "loss": 0.6346, + "step": 111 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 2.694610595703125, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.9856, + "step": 112 + }, + { + "epoch": 0.6108108108108108, + "grad_norm": 2.5885775089263916, + "learning_rate": 4.954919078591521e-06, + "loss": 0.8669, + "step": 113 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 2.593609571456909, + "learning_rate": 4.954112951011628e-06, + "loss": 0.7201, + "step": 114 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 3.3045759201049805, + "learning_rate": 4.9532997464067065e-06, + "loss": 0.9095, + "step": 115 + }, + { + "epoch": 0.6270270270270271, + "grad_norm": 2.8144869804382324, + "learning_rate": 4.952479467121828e-06, + "loss": 1.0213, + "step": 116 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 2.5460312366485596, + "learning_rate": 4.951652115522463e-06, + "loss": 1.1154, + "step": 117 + }, + { + "epoch": 0.6378378378378379, + "grad_norm": 2.795137405395508, + "learning_rate": 4.950817693994481e-06, + "loss": 0.691, + "step": 118 + }, + { + "epoch": 0.6432432432432432, + "grad_norm": 2.4979195594787598, + "learning_rate": 4.949976204944135e-06, + "loss": 0.7224, + "step": 119 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 3.3131983280181885, + "learning_rate": 4.949127650798063e-06, + "loss": 0.9256, + "step": 120 + }, + { + "epoch": 0.654054054054054, + "grad_norm": 2.9060285091400146, + "learning_rate": 4.948272034003275e-06, + "loss": 0.6892, + "step": 121 + }, + { + "epoch": 0.6594594594594595, + "grad_norm": 3.695594549179077, + "learning_rate": 4.947409357027148e-06, + "loss": 0.5878, + "step": 122 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 3.1250460147857666, + "learning_rate": 4.9465396223574165e-06, + "loss": 0.9904, + "step": 123 + }, + { + "epoch": 0.6702702702702703, + "grad_norm": 4.024891376495361, + "learning_rate": 4.945662832502172e-06, + "loss": 1.1592, + "step": 124 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 2.6886494159698486, + "learning_rate": 4.944778989989847e-06, + "loss": 1.0041, + "step": 125 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 2.366912841796875, + "learning_rate": 4.943888097369216e-06, + "loss": 0.7045, + "step": 126 + }, + { + "epoch": 0.6864864864864865, + "grad_norm": 2.394932270050049, + "learning_rate": 4.942990157209381e-06, + "loss": 0.6685, + "step": 127 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 2.61933970451355, + "learning_rate": 4.9420851720997674e-06, + "loss": 0.8812, + "step": 128 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 2.7395646572113037, + "learning_rate": 4.94117314465012e-06, + "loss": 1.3014, + "step": 129 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 3.065484046936035, + "learning_rate": 4.940254077490487e-06, + "loss": 0.6978, + "step": 130 + }, + { + "epoch": 0.7081081081081081, + "grad_norm": 2.895038366317749, + "learning_rate": 4.939327973271222e-06, + "loss": 0.6249, + "step": 131 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 3.1773312091827393, + "learning_rate": 4.9383948346629665e-06, + "loss": 0.6423, + "step": 132 + }, + { + "epoch": 0.7189189189189189, + "grad_norm": 2.2378008365631104, + "learning_rate": 4.937454664356652e-06, + "loss": 0.7193, + "step": 133 + }, + { + "epoch": 0.7243243243243244, + "grad_norm": 2.5673701763153076, + "learning_rate": 4.9365074650634855e-06, + "loss": 0.7065, + "step": 134 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 2.7348387241363525, + "learning_rate": 4.9355532395149445e-06, + "loss": 1.0046, + "step": 135 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 2.391741991043091, + "learning_rate": 4.9345919904627655e-06, + "loss": 0.6771, + "step": 136 + }, + { + "epoch": 0.7405405405405405, + "grad_norm": 2.2096705436706543, + "learning_rate": 4.933623720678944e-06, + "loss": 0.6589, + "step": 137 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 3.0840072631835938, + "learning_rate": 4.932648432955718e-06, + "loss": 0.8755, + "step": 138 + }, + { + "epoch": 0.7513513513513513, + "grad_norm": 2.4970428943634033, + "learning_rate": 4.931666130105564e-06, + "loss": 0.6685, + "step": 139 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 4.315455436706543, + "learning_rate": 4.930676814961189e-06, + "loss": 0.8101, + "step": 140 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 5.388065814971924, + "learning_rate": 4.92968049037552e-06, + "loss": 0.8193, + "step": 141 + }, + { + "epoch": 0.7675675675675676, + "grad_norm": 2.6107139587402344, + "learning_rate": 4.9286771592217005e-06, + "loss": 0.7852, + "step": 142 + }, + { + "epoch": 0.772972972972973, + "grad_norm": 3.936556577682495, + "learning_rate": 4.927666824393076e-06, + "loss": 1.0388, + "step": 143 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 2.74424409866333, + "learning_rate": 4.926649488803191e-06, + "loss": 0.8266, + "step": 144 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 2.8998451232910156, + "learning_rate": 4.925625155385776e-06, + "loss": 0.4895, + "step": 145 + }, + { + "epoch": 0.7891891891891892, + "grad_norm": 3.0631520748138428, + "learning_rate": 4.924593827094743e-06, + "loss": 0.8759, + "step": 146 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 3.233267307281494, + "learning_rate": 4.923555506904176e-06, + "loss": 0.701, + "step": 147 + }, + { + "epoch": 0.8, + "grad_norm": 2.87701416015625, + "learning_rate": 4.922510197808321e-06, + "loss": 1.1327, + "step": 148 + }, + { + "epoch": 0.8054054054054054, + "grad_norm": 3.650576114654541, + "learning_rate": 4.921457902821578e-06, + "loss": 0.7587, + "step": 149 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 3.232112407684326, + "learning_rate": 4.920398624978493e-06, + "loss": 1.2158, + "step": 150 + }, + { + "epoch": 0.8162162162162162, + "grad_norm": 2.468384027481079, + "learning_rate": 4.919332367333748e-06, + "loss": 0.6852, + "step": 151 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 2.5947415828704834, + "learning_rate": 4.918259132962154e-06, + "loss": 0.6611, + "step": 152 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 3.0171427726745605, + "learning_rate": 4.917178924958638e-06, + "loss": 0.7327, + "step": 153 + }, + { + "epoch": 0.8324324324324325, + "grad_norm": 3.293184518814087, + "learning_rate": 4.916091746438243e-06, + "loss": 0.8528, + "step": 154 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 4.0570969581604, + "learning_rate": 4.9149976005361085e-06, + "loss": 0.9141, + "step": 155 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 2.8782784938812256, + "learning_rate": 4.913896490407467e-06, + "loss": 1.1132, + "step": 156 + }, + { + "epoch": 0.8486486486486486, + "grad_norm": 2.5671517848968506, + "learning_rate": 4.912788419227635e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.8540540540540541, + "grad_norm": 2.9445390701293945, + "learning_rate": 4.911673390192002e-06, + "loss": 0.9227, + "step": 158 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 2.472595453262329, + "learning_rate": 4.910551406516023e-06, + "loss": 0.8154, + "step": 159 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 2.5233397483825684, + "learning_rate": 4.909422471435207e-06, + "loss": 0.9897, + "step": 160 + }, + { + "epoch": 0.8702702702702703, + "grad_norm": 3.3919546604156494, + "learning_rate": 4.90828658820511e-06, + "loss": 0.6162, + "step": 161 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 3.060908555984497, + "learning_rate": 4.907143760101325e-06, + "loss": 0.5734, + "step": 162 + }, + { + "epoch": 0.8810810810810811, + "grad_norm": 3.4584782123565674, + "learning_rate": 4.905993990419472e-06, + "loss": 0.8328, + "step": 163 + }, + { + "epoch": 0.8864864864864865, + "grad_norm": 2.936570644378662, + "learning_rate": 4.904837282475187e-06, + "loss": 0.6787, + "step": 164 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 2.564837694168091, + "learning_rate": 4.9036736396041165e-06, + "loss": 0.9658, + "step": 165 + }, + { + "epoch": 0.8972972972972973, + "grad_norm": 3.2509360313415527, + "learning_rate": 4.902503065161905e-06, + "loss": 0.7899, + "step": 166 + }, + { + "epoch": 0.9027027027027027, + "grad_norm": 2.9730329513549805, + "learning_rate": 4.901325562524185e-06, + "loss": 0.9476, + "step": 167 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 3.044980049133301, + "learning_rate": 4.900141135086569e-06, + "loss": 0.7589, + "step": 168 + }, + { + "epoch": 0.9135135135135135, + "grad_norm": 3.030585527420044, + "learning_rate": 4.898949786264638e-06, + "loss": 0.6724, + "step": 169 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 2.249122142791748, + "learning_rate": 4.897751519493933e-06, + "loss": 0.6968, + "step": 170 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 2.9816982746124268, + "learning_rate": 4.896546338229945e-06, + "loss": 0.7984, + "step": 171 + }, + { + "epoch": 0.9297297297297298, + "grad_norm": 2.415736675262451, + "learning_rate": 4.8953342459481034e-06, + "loss": 0.6109, + "step": 172 + }, + { + "epoch": 0.9351351351351351, + "grad_norm": 2.740518808364868, + "learning_rate": 4.894115246143768e-06, + "loss": 0.8126, + "step": 173 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 2.7610201835632324, + "learning_rate": 4.892889342332218e-06, + "loss": 0.6862, + "step": 174 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 3.057025194168091, + "learning_rate": 4.891656538048642e-06, + "loss": 0.9895, + "step": 175 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 2.569751262664795, + "learning_rate": 4.890416836848128e-06, + "loss": 0.8481, + "step": 176 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 2.4443397521972656, + "learning_rate": 4.889170242305652e-06, + "loss": 0.6478, + "step": 177 + }, + { + "epoch": 0.9621621621621622, + "grad_norm": 2.5009846687316895, + "learning_rate": 4.887916758016069e-06, + "loss": 0.9714, + "step": 178 + }, + { + "epoch": 0.9675675675675676, + "grad_norm": 3.101975202560425, + "learning_rate": 4.886656387594104e-06, + "loss": 1.1264, + "step": 179 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 2.6144704818725586, + "learning_rate": 4.885389134674338e-06, + "loss": 0.7664, + "step": 180 + }, + { + "epoch": 0.9783783783783784, + "grad_norm": 2.5834381580352783, + "learning_rate": 4.884115002911197e-06, + "loss": 0.6131, + "step": 181 + }, + { + "epoch": 0.9837837837837838, + "grad_norm": 2.5378055572509766, + "learning_rate": 4.88283399597895e-06, + "loss": 0.8733, + "step": 182 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 2.4095377922058105, + "learning_rate": 4.881546117571686e-06, + "loss": 0.643, + "step": 183 + }, + { + "epoch": 0.9945945945945946, + "grad_norm": 2.9554507732391357, + "learning_rate": 4.8802513714033135e-06, + "loss": 0.7287, + "step": 184 + }, + { + "epoch": 1.0, + "grad_norm": 2.8279213905334473, + "learning_rate": 4.878949761207545e-06, + "loss": 0.9927, + "step": 185 + }, + { + "epoch": 1.0054054054054054, + "grad_norm": 2.9361412525177, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.66, + "step": 186 + }, + { + "epoch": 1.0108108108108107, + "grad_norm": 3.392244338989258, + "learning_rate": 4.876325963767623e-06, + "loss": 0.594, + "step": 187 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 2.6276044845581055, + "learning_rate": 4.875003784089822e-06, + "loss": 0.5825, + "step": 188 + }, + { + "epoch": 1.0216216216216216, + "grad_norm": 2.2875545024871826, + "learning_rate": 4.873674755517305e-06, + "loss": 0.6594, + "step": 189 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 2.8086795806884766, + "learning_rate": 4.872338881882645e-06, + "loss": 0.7536, + "step": 190 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 2.3685200214385986, + "learning_rate": 4.870996167038154e-06, + "loss": 0.4849, + "step": 191 + }, + { + "epoch": 1.037837837837838, + "grad_norm": 3.0264766216278076, + "learning_rate": 4.869646614855877e-06, + "loss": 0.3771, + "step": 192 + }, + { + "epoch": 1.0432432432432432, + "grad_norm": 4.335122108459473, + "learning_rate": 4.868290229227567e-06, + "loss": 0.8545, + "step": 193 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 3.442172050476074, + "learning_rate": 4.866927014064692e-06, + "loss": 0.3698, + "step": 194 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 3.326539993286133, + "learning_rate": 4.86555697329841e-06, + "loss": 0.8468, + "step": 195 + }, + { + "epoch": 1.0594594594594595, + "grad_norm": 3.0372447967529297, + "learning_rate": 4.864180110879562e-06, + "loss": 0.8232, + "step": 196 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 2.955343008041382, + "learning_rate": 4.862796430778663e-06, + "loss": 0.4097, + "step": 197 + }, + { + "epoch": 1.0702702702702702, + "grad_norm": 2.4095399379730225, + "learning_rate": 4.861405936985889e-06, + "loss": 0.6746, + "step": 198 + }, + { + "epoch": 1.0756756756756758, + "grad_norm": 2.763500452041626, + "learning_rate": 4.860008633511059e-06, + "loss": 0.6605, + "step": 199 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 2.6751155853271484, + "learning_rate": 4.8586045243836384e-06, + "loss": 0.471, + "step": 200 + }, + { + "epoch": 1.0864864864864865, + "grad_norm": 3.3507862091064453, + "learning_rate": 4.857193613652711e-06, + "loss": 0.7665, + "step": 201 + }, + { + "epoch": 1.0918918918918918, + "grad_norm": 3.3064827919006348, + "learning_rate": 4.8557759053869775e-06, + "loss": 0.6436, + "step": 202 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 2.571828603744507, + "learning_rate": 4.854351403674741e-06, + "loss": 0.4642, + "step": 203 + }, + { + "epoch": 1.1027027027027028, + "grad_norm": 2.883220911026001, + "learning_rate": 4.852920112623895e-06, + "loss": 0.5737, + "step": 204 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 3.026144027709961, + "learning_rate": 4.851482036361912e-06, + "loss": 0.7302, + "step": 205 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 2.6689612865448, + "learning_rate": 4.850037179035829e-06, + "loss": 0.5229, + "step": 206 + }, + { + "epoch": 1.118918918918919, + "grad_norm": 2.4019956588745117, + "learning_rate": 4.8485855448122425e-06, + "loss": 0.5529, + "step": 207 + }, + { + "epoch": 1.1243243243243244, + "grad_norm": 2.3546230792999268, + "learning_rate": 4.847127137877286e-06, + "loss": 0.3635, + "step": 208 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 2.999096393585205, + "learning_rate": 4.8456619624366285e-06, + "loss": 0.8149, + "step": 209 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 10.072900772094727, + "learning_rate": 4.844190022715456e-06, + "loss": 0.8333, + "step": 210 + }, + { + "epoch": 1.1405405405405404, + "grad_norm": 2.222123384475708, + "learning_rate": 4.84271132295846e-06, + "loss": 0.3717, + "step": 211 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 2.8751113414764404, + "learning_rate": 4.841225867429826e-06, + "loss": 0.5994, + "step": 212 + }, + { + "epoch": 1.1513513513513514, + "grad_norm": 2.9580111503601074, + "learning_rate": 4.839733660413224e-06, + "loss": 0.8382, + "step": 213 + }, + { + "epoch": 1.1567567567567567, + "grad_norm": 4.628892421722412, + "learning_rate": 4.838234706211792e-06, + "loss": 0.818, + "step": 214 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 2.5103509426116943, + "learning_rate": 4.836729009148124e-06, + "loss": 0.4267, + "step": 215 + }, + { + "epoch": 1.1675675675675676, + "grad_norm": 2.6093738079071045, + "learning_rate": 4.835216573564261e-06, + "loss": 0.3472, + "step": 216 + }, + { + "epoch": 1.172972972972973, + "grad_norm": 3.0792338848114014, + "learning_rate": 4.833697403821672e-06, + "loss": 0.6323, + "step": 217 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 2.845163345336914, + "learning_rate": 4.8321715043012516e-06, + "loss": 0.6831, + "step": 218 + }, + { + "epoch": 1.1837837837837837, + "grad_norm": 3.0433948040008545, + "learning_rate": 4.830638879403296e-06, + "loss": 0.3682, + "step": 219 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 2.6533594131469727, + "learning_rate": 4.8290995335475e-06, + "loss": 0.4154, + "step": 220 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 2.9271352291107178, + "learning_rate": 4.827553471172935e-06, + "loss": 0.3991, + "step": 221 + }, + { + "epoch": 1.2, + "grad_norm": 2.9243528842926025, + "learning_rate": 4.826000696738045e-06, + "loss": 0.4538, + "step": 222 + }, + { + "epoch": 1.2054054054054055, + "grad_norm": 2.537332534790039, + "learning_rate": 4.824441214720629e-06, + "loss": 0.7692, + "step": 223 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 3.9193246364593506, + "learning_rate": 4.8228750296178275e-06, + "loss": 0.6038, + "step": 224 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 2.6646728515625, + "learning_rate": 4.821302145946113e-06, + "loss": 0.4147, + "step": 225 + }, + { + "epoch": 1.2216216216216216, + "grad_norm": 2.6519482135772705, + "learning_rate": 4.819722568241274e-06, + "loss": 0.5398, + "step": 226 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 2.2018048763275146, + "learning_rate": 4.818136301058401e-06, + "loss": 0.3864, + "step": 227 + }, + { + "epoch": 1.2324324324324325, + "grad_norm": 2.5660712718963623, + "learning_rate": 4.816543348971879e-06, + "loss": 0.5712, + "step": 228 + }, + { + "epoch": 1.2378378378378379, + "grad_norm": 3.237663745880127, + "learning_rate": 4.814943716575368e-06, + "loss": 0.662, + "step": 229 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 2.5570430755615234, + "learning_rate": 4.813337408481793e-06, + "loss": 0.8661, + "step": 230 + }, + { + "epoch": 1.2486486486486488, + "grad_norm": 2.9231269359588623, + "learning_rate": 4.811724429323329e-06, + "loss": 0.9218, + "step": 231 + }, + { + "epoch": 1.2540540540540541, + "grad_norm": 3.637084722518921, + "learning_rate": 4.810104783751389e-06, + "loss": 0.5597, + "step": 232 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 3.0218842029571533, + "learning_rate": 4.8084784764366125e-06, + "loss": 0.4786, + "step": 233 + }, + { + "epoch": 1.2648648648648648, + "grad_norm": 2.770214080810547, + "learning_rate": 4.806845512068846e-06, + "loss": 0.5219, + "step": 234 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 3.093053102493286, + "learning_rate": 4.805205895357137e-06, + "loss": 0.643, + "step": 235 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 2.6373348236083984, + "learning_rate": 4.803559631029713e-06, + "loss": 0.5858, + "step": 236 + }, + { + "epoch": 1.281081081081081, + "grad_norm": 2.452030897140503, + "learning_rate": 4.801906723833973e-06, + "loss": 0.4185, + "step": 237 + }, + { + "epoch": 1.2864864864864864, + "grad_norm": 2.72564697265625, + "learning_rate": 4.8002471785364734e-06, + "loss": 0.4917, + "step": 238 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 3.0389158725738525, + "learning_rate": 4.798580999922913e-06, + "loss": 0.645, + "step": 239 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 3.7002289295196533, + "learning_rate": 4.796908192798117e-06, + "loss": 0.5378, + "step": 240 + }, + { + "epoch": 1.3027027027027027, + "grad_norm": 2.1876111030578613, + "learning_rate": 4.7952287619860276e-06, + "loss": 0.5197, + "step": 241 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 3.903337240219116, + "learning_rate": 4.793542712329689e-06, + "loss": 1.0226, + "step": 242 + }, + { + "epoch": 1.3135135135135134, + "grad_norm": 2.3623552322387695, + "learning_rate": 4.791850048691228e-06, + "loss": 0.5502, + "step": 243 + }, + { + "epoch": 1.318918918918919, + "grad_norm": 3.0669031143188477, + "learning_rate": 4.79015077595185e-06, + "loss": 0.6976, + "step": 244 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 3.1480472087860107, + "learning_rate": 4.788444899011816e-06, + "loss": 0.4795, + "step": 245 + }, + { + "epoch": 1.3297297297297297, + "grad_norm": 3.7051920890808105, + "learning_rate": 4.786732422790432e-06, + "loss": 0.6526, + "step": 246 + }, + { + "epoch": 1.3351351351351353, + "grad_norm": 3.4358389377593994, + "learning_rate": 4.785013352226036e-06, + "loss": 0.5551, + "step": 247 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 2.3789355754852295, + "learning_rate": 4.7832876922759805e-06, + "loss": 0.3151, + "step": 248 + }, + { + "epoch": 1.345945945945946, + "grad_norm": 2.4843716621398926, + "learning_rate": 4.781555447916622e-06, + "loss": 0.6713, + "step": 249 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 3.0176303386688232, + "learning_rate": 4.779816624143302e-06, + "loss": 0.437, + "step": 250 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 2.868350028991699, + "learning_rate": 4.77807122597034e-06, + "loss": 0.7632, + "step": 251 + }, + { + "epoch": 1.3621621621621622, + "grad_norm": 2.4629738330841064, + "learning_rate": 4.776319258431009e-06, + "loss": 0.4894, + "step": 252 + }, + { + "epoch": 1.3675675675675676, + "grad_norm": 2.798297882080078, + "learning_rate": 4.77456072657753e-06, + "loss": 0.4456, + "step": 253 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 3.2977547645568848, + "learning_rate": 4.772795635481053e-06, + "loss": 0.5381, + "step": 254 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 4.1061906814575195, + "learning_rate": 4.77102399023164e-06, + "loss": 1.0302, + "step": 255 + }, + { + "epoch": 1.3837837837837839, + "grad_norm": 3.943284511566162, + "learning_rate": 4.769245795938261e-06, + "loss": 0.4875, + "step": 256 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 2.6420533657073975, + "learning_rate": 4.767461057728763e-06, + "loss": 0.4923, + "step": 257 + }, + { + "epoch": 1.3945945945945946, + "grad_norm": 3.3152263164520264, + "learning_rate": 4.76566978074987e-06, + "loss": 0.6699, + "step": 258 + }, + { + "epoch": 1.4, + "grad_norm": 2.6928882598876953, + "learning_rate": 4.7638719701671586e-06, + "loss": 0.6117, + "step": 259 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 2.706597328186035, + "learning_rate": 4.762067631165049e-06, + "loss": 0.8534, + "step": 260 + }, + { + "epoch": 1.4108108108108108, + "grad_norm": 2.9912848472595215, + "learning_rate": 4.760256768946787e-06, + "loss": 0.5057, + "step": 261 + }, + { + "epoch": 1.4162162162162162, + "grad_norm": 2.7098443508148193, + "learning_rate": 4.758439388734429e-06, + "loss": 0.7286, + "step": 262 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 3.1288092136383057, + "learning_rate": 4.7566154957688276e-06, + "loss": 0.9827, + "step": 263 + }, + { + "epoch": 1.427027027027027, + "grad_norm": 3.0505919456481934, + "learning_rate": 4.754785095309617e-06, + "loss": 0.7042, + "step": 264 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 2.6800339221954346, + "learning_rate": 4.752948192635199e-06, + "loss": 0.5179, + "step": 265 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 2.2246861457824707, + "learning_rate": 4.751104793042722e-06, + "loss": 0.8527, + "step": 266 + }, + { + "epoch": 1.4432432432432432, + "grad_norm": 2.4242751598358154, + "learning_rate": 4.7492549018480725e-06, + "loss": 0.5627, + "step": 267 + }, + { + "epoch": 1.4486486486486487, + "grad_norm": 2.763244152069092, + "learning_rate": 4.747398524385858e-06, + "loss": 0.8981, + "step": 268 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 2.856595993041992, + "learning_rate": 4.745535666009389e-06, + "loss": 0.5455, + "step": 269 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 2.4168624877929688, + "learning_rate": 4.743666332090664e-06, + "loss": 0.4348, + "step": 270 + }, + { + "epoch": 1.464864864864865, + "grad_norm": 2.5408060550689697, + "learning_rate": 4.74179052802036e-06, + "loss": 0.5524, + "step": 271 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 2.6216673851013184, + "learning_rate": 4.739908259207807e-06, + "loss": 0.7469, + "step": 272 + }, + { + "epoch": 1.4756756756756757, + "grad_norm": 5.397300720214844, + "learning_rate": 4.738019531080981e-06, + "loss": 0.7216, + "step": 273 + }, + { + "epoch": 1.481081081081081, + "grad_norm": 3.3481080532073975, + "learning_rate": 4.7361243490864825e-06, + "loss": 0.7527, + "step": 274 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 2.7943873405456543, + "learning_rate": 4.734222718689527e-06, + "loss": 0.7437, + "step": 275 + }, + { + "epoch": 1.491891891891892, + "grad_norm": 2.206890344619751, + "learning_rate": 4.732314645373922e-06, + "loss": 0.5187, + "step": 276 + }, + { + "epoch": 1.4972972972972973, + "grad_norm": 2.76442813873291, + "learning_rate": 4.730400134642055e-06, + "loss": 0.7186, + "step": 277 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 3.4754087924957275, + "learning_rate": 4.728479192014879e-06, + "loss": 0.9655, + "step": 278 + }, + { + "epoch": 1.5081081081081082, + "grad_norm": 2.923779249191284, + "learning_rate": 4.726551823031895e-06, + "loss": 0.6251, + "step": 279 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 3.1142773628234863, + "learning_rate": 4.7246180332511335e-06, + "loss": 0.4805, + "step": 280 + }, + { + "epoch": 1.518918918918919, + "grad_norm": 2.3477070331573486, + "learning_rate": 4.722677828249142e-06, + "loss": 1.0939, + "step": 281 + }, + { + "epoch": 1.5243243243243243, + "grad_norm": 2.8418569564819336, + "learning_rate": 4.720731213620972e-06, + "loss": 0.9485, + "step": 282 + }, + { + "epoch": 1.5297297297297296, + "grad_norm": 2.462710380554199, + "learning_rate": 4.718778194980152e-06, + "loss": 0.5805, + "step": 283 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 3.2379209995269775, + "learning_rate": 4.7168187779586805e-06, + "loss": 0.77, + "step": 284 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 3.0701661109924316, + "learning_rate": 4.71485296820701e-06, + "loss": 0.5932, + "step": 285 + }, + { + "epoch": 1.545945945945946, + "grad_norm": 4.099547386169434, + "learning_rate": 4.7128807713940245e-06, + "loss": 0.6296, + "step": 286 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 2.5529167652130127, + "learning_rate": 4.710902193207028e-06, + "loss": 0.6201, + "step": 287 + }, + { + "epoch": 1.5567567567567568, + "grad_norm": 2.794926881790161, + "learning_rate": 4.708917239351727e-06, + "loss": 0.5682, + "step": 288 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 3.2522501945495605, + "learning_rate": 4.706925915552214e-06, + "loss": 0.8877, + "step": 289 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 2.811847448348999, + "learning_rate": 4.704928227550949e-06, + "loss": 0.6521, + "step": 290 + }, + { + "epoch": 1.572972972972973, + "grad_norm": 2.7060673236846924, + "learning_rate": 4.702924181108745e-06, + "loss": 0.4929, + "step": 291 + }, + { + "epoch": 1.5783783783783782, + "grad_norm": 2.5009031295776367, + "learning_rate": 4.700913782004755e-06, + "loss": 0.4515, + "step": 292 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 2.6722700595855713, + "learning_rate": 4.698897036036446e-06, + "loss": 0.5477, + "step": 293 + }, + { + "epoch": 1.5891891891891892, + "grad_norm": 3.3333957195281982, + "learning_rate": 4.696873949019591e-06, + "loss": 0.9589, + "step": 294 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 2.4862897396087646, + "learning_rate": 4.694844526788248e-06, + "loss": 0.4425, + "step": 295 + }, + { + "epoch": 1.6, + "grad_norm": 2.78708553314209, + "learning_rate": 4.692808775194745e-06, + "loss": 0.4899, + "step": 296 + }, + { + "epoch": 1.6054054054054054, + "grad_norm": 2.9121289253234863, + "learning_rate": 4.690766700109659e-06, + "loss": 0.4884, + "step": 297 + }, + { + "epoch": 1.6108108108108108, + "grad_norm": 4.692054271697998, + "learning_rate": 4.688718307421807e-06, + "loss": 0.8977, + "step": 298 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 3.1290926933288574, + "learning_rate": 4.686663603038222e-06, + "loss": 0.6833, + "step": 299 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 3.5091123580932617, + "learning_rate": 4.6846025928841365e-06, + "loss": 0.9141, + "step": 300 + }, + { + "epoch": 1.627027027027027, + "grad_norm": 2.5466184616088867, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.5121, + "step": 301 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 2.7833092212677, + "learning_rate": 4.68046167905631e-06, + "loss": 0.5399, + "step": 302 + }, + { + "epoch": 1.637837837837838, + "grad_norm": 3.05135440826416, + "learning_rate": 4.678381787323889e-06, + "loss": 0.7921, + "step": 303 + }, + { + "epoch": 1.6432432432432433, + "grad_norm": 2.2391726970672607, + "learning_rate": 4.676295613703577e-06, + "loss": 0.7178, + "step": 304 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 2.3654022216796875, + "learning_rate": 4.674203164211357e-06, + "loss": 0.7162, + "step": 305 + }, + { + "epoch": 1.654054054054054, + "grad_norm": 2.436009645462036, + "learning_rate": 4.67210444488131e-06, + "loss": 0.6539, + "step": 306 + }, + { + "epoch": 1.6594594594594594, + "grad_norm": 2.6034209728240967, + "learning_rate": 4.669999461765599e-06, + "loss": 0.7214, + "step": 307 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 2.804229497909546, + "learning_rate": 4.6678882209344474e-06, + "loss": 0.7451, + "step": 308 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 2.6239655017852783, + "learning_rate": 4.665770728476127e-06, + "loss": 0.6464, + "step": 309 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 2.9320099353790283, + "learning_rate": 4.663646990496939e-06, + "loss": 0.6669, + "step": 310 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 3.09713077545166, + "learning_rate": 4.661517013121189e-06, + "loss": 0.8972, + "step": 311 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 3.6576132774353027, + "learning_rate": 4.659380802491181e-06, + "loss": 0.6286, + "step": 312 + }, + { + "epoch": 1.691891891891892, + "grad_norm": 2.9320433139801025, + "learning_rate": 4.6572383647671915e-06, + "loss": 0.3631, + "step": 313 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 3.399357557296753, + "learning_rate": 4.655089706127457e-06, + "loss": 0.5682, + "step": 314 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 2.7667412757873535, + "learning_rate": 4.652934832768148e-06, + "loss": 0.5457, + "step": 315 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 2.3023321628570557, + "learning_rate": 4.650773750903363e-06, + "loss": 0.6601, + "step": 316 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 2.6584670543670654, + "learning_rate": 4.6486064667651005e-06, + "loss": 0.5882, + "step": 317 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 5.528168678283691, + "learning_rate": 4.646432986603245e-06, + "loss": 0.7628, + "step": 318 + }, + { + "epoch": 1.7243243243243245, + "grad_norm": 3.054884195327759, + "learning_rate": 4.644253316685552e-06, + "loss": 0.6877, + "step": 319 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 3.2672388553619385, + "learning_rate": 4.6420674632976205e-06, + "loss": 0.7026, + "step": 320 + }, + { + "epoch": 1.7351351351351352, + "grad_norm": 3.109384536743164, + "learning_rate": 4.639875432742886e-06, + "loss": 0.5236, + "step": 321 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 3.3593883514404297, + "learning_rate": 4.6376772313425975e-06, + "loss": 0.6463, + "step": 322 + }, + { + "epoch": 1.7459459459459459, + "grad_norm": 2.6352698802948, + "learning_rate": 4.635472865435795e-06, + "loss": 0.6903, + "step": 323 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 2.751690149307251, + "learning_rate": 4.6332623413792995e-06, + "loss": 0.7342, + "step": 324 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 2.670915126800537, + "learning_rate": 4.6310456655476874e-06, + "loss": 0.4302, + "step": 325 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 2.7648138999938965, + "learning_rate": 4.6288228443332786e-06, + "loss": 0.5108, + "step": 326 + }, + { + "epoch": 1.7675675675675677, + "grad_norm": 2.7451536655426025, + "learning_rate": 4.626593884146111e-06, + "loss": 0.7646, + "step": 327 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 2.4656403064727783, + "learning_rate": 4.624358791413928e-06, + "loss": 0.5529, + "step": 328 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 2.5987517833709717, + "learning_rate": 4.622117572582159e-06, + "loss": 0.609, + "step": 329 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 3.3843371868133545, + "learning_rate": 4.619870234113894e-06, + "loss": 0.9146, + "step": 330 + }, + { + "epoch": 1.7891891891891891, + "grad_norm": 2.3542068004608154, + "learning_rate": 4.617616782489878e-06, + "loss": 0.6887, + "step": 331 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 2.2049715518951416, + "learning_rate": 4.615357224208477e-06, + "loss": 0.505, + "step": 332 + }, + { + "epoch": 1.8, + "grad_norm": 2.453920364379883, + "learning_rate": 4.613091565785674e-06, + "loss": 0.8384, + "step": 333 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 2.5751583576202393, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5512, + "step": 334 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 2.524075984954834, + "learning_rate": 4.608541974667714e-06, + "loss": 0.4877, + "step": 335 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 2.2856955528259277, + "learning_rate": 4.606258055092397e-06, + "loss": 0.5583, + "step": 336 + }, + { + "epoch": 1.8216216216216217, + "grad_norm": 2.2773683071136475, + "learning_rate": 4.603968061615321e-06, + "loss": 0.5421, + "step": 337 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 4.085512161254883, + "learning_rate": 4.601672000840231e-06, + "loss": 0.942, + "step": 338 + }, + { + "epoch": 1.8324324324324324, + "grad_norm": 2.3710968494415283, + "learning_rate": 4.5993698793883715e-06, + "loss": 0.3773, + "step": 339 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 2.745534658432007, + "learning_rate": 4.597061703898462e-06, + "loss": 0.9694, + "step": 340 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 2.463207244873047, + "learning_rate": 4.594747481026685e-06, + "loss": 0.4667, + "step": 341 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 2.7216601371765137, + "learning_rate": 4.592427217446656e-06, + "loss": 0.4267, + "step": 342 + }, + { + "epoch": 1.8540540540540542, + "grad_norm": 2.545664072036743, + "learning_rate": 4.590100919849413e-06, + "loss": 0.9245, + "step": 343 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 3.692840337753296, + "learning_rate": 4.587768594943396e-06, + "loss": 0.7502, + "step": 344 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 2.993229627609253, + "learning_rate": 4.585430249454426e-06, + "loss": 0.4689, + "step": 345 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 2.162867546081543, + "learning_rate": 4.583085890125682e-06, + "loss": 0.6188, + "step": 346 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 2.2169792652130127, + "learning_rate": 4.5807355237176896e-06, + "loss": 0.6352, + "step": 347 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 3.978985548019409, + "learning_rate": 4.578379157008296e-06, + "loss": 0.464, + "step": 348 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 2.236682653427124, + "learning_rate": 4.57601679679265e-06, + "loss": 0.5943, + "step": 349 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 2.528754472732544, + "learning_rate": 4.573648449883188e-06, + "loss": 0.6949, + "step": 350 + }, + { + "epoch": 1.8972972972972975, + "grad_norm": 2.7673721313476562, + "learning_rate": 4.571274123109606e-06, + "loss": 0.4333, + "step": 351 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 2.698012351989746, + "learning_rate": 4.568893823318847e-06, + "loss": 0.6796, + "step": 352 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 2.9640560150146484, + "learning_rate": 4.566507557375077e-06, + "loss": 0.6139, + "step": 353 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 2.417628526687622, + "learning_rate": 4.5641153321596684e-06, + "loss": 0.4515, + "step": 354 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 2.676739454269409, + "learning_rate": 4.56171715457118e-06, + "loss": 0.8426, + "step": 355 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 2.8428189754486084, + "learning_rate": 4.559313031525331e-06, + "loss": 0.5806, + "step": 356 + }, + { + "epoch": 1.9297297297297298, + "grad_norm": 2.6817944049835205, + "learning_rate": 4.55690296995499e-06, + "loss": 0.5927, + "step": 357 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 3.5939931869506836, + "learning_rate": 4.554486976810149e-06, + "loss": 0.9986, + "step": 358 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 2.86688494682312, + "learning_rate": 4.552065059057906e-06, + "loss": 0.6813, + "step": 359 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 2.9295246601104736, + "learning_rate": 4.549637223682441e-06, + "loss": 1.0832, + "step": 360 + }, + { + "epoch": 1.9513513513513514, + "grad_norm": 2.6939451694488525, + "learning_rate": 4.547203477685005e-06, + "loss": 0.7377, + "step": 361 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 2.226055145263672, + "learning_rate": 4.544763828083888e-06, + "loss": 0.5412, + "step": 362 + }, + { + "epoch": 1.962162162162162, + "grad_norm": 2.490187406539917, + "learning_rate": 4.542318281914405e-06, + "loss": 0.6955, + "step": 363 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 2.9241302013397217, + "learning_rate": 4.53986684622888e-06, + "loss": 0.6774, + "step": 364 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 2.988084554672241, + "learning_rate": 4.537409528096615e-06, + "loss": 0.5832, + "step": 365 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 2.9380626678466797, + "learning_rate": 4.534946334603879e-06, + "loss": 0.606, + "step": 366 + }, + { + "epoch": 1.983783783783784, + "grad_norm": 2.667588710784912, + "learning_rate": 4.532477272853882e-06, + "loss": 0.4991, + "step": 367 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 2.9711899757385254, + "learning_rate": 4.530002349966759e-06, + "loss": 0.4442, + "step": 368 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 3.443957805633545, + "learning_rate": 4.5275215730795445e-06, + "loss": 0.6566, + "step": 369 + }, + { + "epoch": 2.0, + "grad_norm": 3.590317487716675, + "learning_rate": 4.525034949346156e-06, + "loss": 0.5687, + "step": 370 + }, + { + "epoch": 2.0054054054054054, + "grad_norm": 3.678600549697876, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4458, + "step": 371 + }, + { + "epoch": 2.0108108108108107, + "grad_norm": 3.803563356399536, + "learning_rate": 4.5200441900408045e-06, + "loss": 0.4418, + "step": 372 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 2.9187233448028564, + "learning_rate": 4.517540068860898e-06, + "loss": 0.7057, + "step": 373 + }, + { + "epoch": 2.0216216216216214, + "grad_norm": 2.693603515625, + "learning_rate": 4.515030129618884e-06, + "loss": 0.4491, + "step": 374 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 2.3883047103881836, + "learning_rate": 4.512514379552779e-06, + "loss": 0.3571, + "step": 375 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 4.558557033538818, + "learning_rate": 4.509992825917352e-06, + "loss": 0.5056, + "step": 376 + }, + { + "epoch": 2.037837837837838, + "grad_norm": 3.9574761390686035, + "learning_rate": 4.507465475984109e-06, + "loss": 0.6834, + "step": 377 + }, + { + "epoch": 2.0432432432432432, + "grad_norm": 5.34630012512207, + "learning_rate": 4.504932337041272e-06, + "loss": 0.6726, + "step": 378 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 3.198740243911743, + "learning_rate": 4.502393416393757e-06, + "loss": 0.4032, + "step": 379 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 3.347480297088623, + "learning_rate": 4.4998487213631515e-06, + "loss": 0.5442, + "step": 380 + }, + { + "epoch": 2.0594594594594593, + "grad_norm": 3.940531015396118, + "learning_rate": 4.497298259287696e-06, + "loss": 0.6181, + "step": 381 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 3.0910496711730957, + "learning_rate": 4.494742037522261e-06, + "loss": 0.3829, + "step": 382 + }, + { + "epoch": 2.0702702702702704, + "grad_norm": 4.060451984405518, + "learning_rate": 4.4921800634383295e-06, + "loss": 0.4953, + "step": 383 + }, + { + "epoch": 2.075675675675676, + "grad_norm": 3.1667511463165283, + "learning_rate": 4.4896123444239655e-06, + "loss": 0.3254, + "step": 384 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 3.0239670276641846, + "learning_rate": 4.487038887883809e-06, + "loss": 0.555, + "step": 385 + }, + { + "epoch": 2.0864864864864865, + "grad_norm": 2.8815383911132812, + "learning_rate": 4.484459701239038e-06, + "loss": 0.665, + "step": 386 + }, + { + "epoch": 2.091891891891892, + "grad_norm": 3.615537166595459, + "learning_rate": 4.481874791927358e-06, + "loss": 0.2652, + "step": 387 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 3.407407283782959, + "learning_rate": 4.479284167402977e-06, + "loss": 0.3811, + "step": 388 + }, + { + "epoch": 2.1027027027027025, + "grad_norm": 2.6651623249053955, + "learning_rate": 4.476687835136585e-06, + "loss": 0.2463, + "step": 389 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 3.5145862102508545, + "learning_rate": 4.47408580261533e-06, + "loss": 0.5507, + "step": 390 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 3.0952725410461426, + "learning_rate": 4.471478077342798e-06, + "loss": 0.288, + "step": 391 + }, + { + "epoch": 2.118918918918919, + "grad_norm": 2.634775400161743, + "learning_rate": 4.468864666838994e-06, + "loss": 0.5169, + "step": 392 + }, + { + "epoch": 2.1243243243243244, + "grad_norm": 3.7388594150543213, + "learning_rate": 4.4662455786403125e-06, + "loss": 0.3327, + "step": 393 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 3.8197360038757324, + "learning_rate": 4.463620820299528e-06, + "loss": 0.3877, + "step": 394 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 3.0073485374450684, + "learning_rate": 4.4609903993857606e-06, + "loss": 0.5425, + "step": 395 + }, + { + "epoch": 2.1405405405405404, + "grad_norm": 2.6923868656158447, + "learning_rate": 4.458354323484462e-06, + "loss": 0.5257, + "step": 396 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 3.2151331901550293, + "learning_rate": 4.45571260019739e-06, + "loss": 0.3914, + "step": 397 + }, + { + "epoch": 2.1513513513513516, + "grad_norm": 3.4031248092651367, + "learning_rate": 4.453065237142592e-06, + "loss": 0.3455, + "step": 398 + }, + { + "epoch": 2.156756756756757, + "grad_norm": 3.012275457382202, + "learning_rate": 4.4504122419543745e-06, + "loss": 0.4652, + "step": 399 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 3.3084208965301514, + "learning_rate": 4.4477536222832865e-06, + "loss": 0.6343, + "step": 400 + }, + { + "epoch": 2.1675675675675676, + "grad_norm": 3.115206241607666, + "learning_rate": 4.445089385796099e-06, + "loss": 0.6975, + "step": 401 + }, + { + "epoch": 2.172972972972973, + "grad_norm": 2.893930435180664, + "learning_rate": 4.442419540175778e-06, + "loss": 0.5779, + "step": 402 + }, + { + "epoch": 2.1783783783783783, + "grad_norm": 3.0549168586730957, + "learning_rate": 4.439744093121465e-06, + "loss": 0.4541, + "step": 403 + }, + { + "epoch": 2.1837837837837837, + "grad_norm": 3.1189024448394775, + "learning_rate": 4.437063052348457e-06, + "loss": 0.4078, + "step": 404 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 6.644659042358398, + "learning_rate": 4.434376425588179e-06, + "loss": 0.6759, + "step": 405 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 2.807554006576538, + "learning_rate": 4.431684220588163e-06, + "loss": 0.2938, + "step": 406 + }, + { + "epoch": 2.2, + "grad_norm": 3.6900999546051025, + "learning_rate": 4.428986445112034e-06, + "loss": 0.676, + "step": 407 + }, + { + "epoch": 2.2054054054054055, + "grad_norm": 2.0721664428710938, + "learning_rate": 4.426283106939474e-06, + "loss": 0.1859, + "step": 408 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 2.953388214111328, + "learning_rate": 4.423574213866209e-06, + "loss": 0.2955, + "step": 409 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 3.049050807952881, + "learning_rate": 4.420859773703985e-06, + "loss": 0.2262, + "step": 410 + }, + { + "epoch": 2.2216216216216216, + "grad_norm": 3.319796323776245, + "learning_rate": 4.418139794280542e-06, + "loss": 0.2273, + "step": 411 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 2.4133522510528564, + "learning_rate": 4.415414283439595e-06, + "loss": 0.3282, + "step": 412 + }, + { + "epoch": 2.2324324324324323, + "grad_norm": 2.9842193126678467, + "learning_rate": 4.4126832490408116e-06, + "loss": 0.3651, + "step": 413 + }, + { + "epoch": 2.237837837837838, + "grad_norm": 2.759531259536743, + "learning_rate": 4.409946698959784e-06, + "loss": 0.4052, + "step": 414 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 3.045485019683838, + "learning_rate": 4.4072046410880145e-06, + "loss": 0.4638, + "step": 415 + }, + { + "epoch": 2.2486486486486488, + "grad_norm": 3.0058295726776123, + "learning_rate": 4.404457083332887e-06, + "loss": 0.517, + "step": 416 + }, + { + "epoch": 2.254054054054054, + "grad_norm": 3.025688409805298, + "learning_rate": 4.401704033617643e-06, + "loss": 0.6902, + "step": 417 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 3.3047802448272705, + "learning_rate": 4.398945499881366e-06, + "loss": 0.3552, + "step": 418 + }, + { + "epoch": 2.264864864864865, + "grad_norm": 3.0683655738830566, + "learning_rate": 4.396181490078949e-06, + "loss": 0.286, + "step": 419 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 3.627681016921997, + "learning_rate": 4.393412012181082e-06, + "loss": 0.4036, + "step": 420 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 4.552238941192627, + "learning_rate": 4.390637074174219e-06, + "loss": 0.8037, + "step": 421 + }, + { + "epoch": 2.281081081081081, + "grad_norm": 2.8688855171203613, + "learning_rate": 4.387856684060561e-06, + "loss": 0.2553, + "step": 422 + }, + { + "epoch": 2.2864864864864867, + "grad_norm": 4.21850061416626, + "learning_rate": 4.385070849858033e-06, + "loss": 0.6222, + "step": 423 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 3.038433790206909, + "learning_rate": 4.382279579600257e-06, + "loss": 0.5326, + "step": 424 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 3.297300338745117, + "learning_rate": 4.379482881336532e-06, + "loss": 0.5515, + "step": 425 + }, + { + "epoch": 2.3027027027027027, + "grad_norm": 7.162952423095703, + "learning_rate": 4.376680763131811e-06, + "loss": 0.6948, + "step": 426 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 3.2403595447540283, + "learning_rate": 4.373873233066676e-06, + "loss": 0.2947, + "step": 427 + }, + { + "epoch": 2.3135135135135134, + "grad_norm": 3.2969906330108643, + "learning_rate": 4.371060299237315e-06, + "loss": 0.2261, + "step": 428 + }, + { + "epoch": 2.3189189189189188, + "grad_norm": 2.669058322906494, + "learning_rate": 4.368241969755499e-06, + "loss": 0.5398, + "step": 429 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 2.7643518447875977, + "learning_rate": 4.36541825274856e-06, + "loss": 0.3301, + "step": 430 + }, + { + "epoch": 2.32972972972973, + "grad_norm": 3.6037657260894775, + "learning_rate": 4.3625891563593635e-06, + "loss": 0.6064, + "step": 431 + }, + { + "epoch": 2.3351351351351353, + "grad_norm": 2.8805618286132812, + "learning_rate": 4.35975468874629e-06, + "loss": 0.3897, + "step": 432 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 2.642402172088623, + "learning_rate": 4.356914858083211e-06, + "loss": 0.271, + "step": 433 + }, + { + "epoch": 2.345945945945946, + "grad_norm": 2.916337490081787, + "learning_rate": 4.354069672559458e-06, + "loss": 0.3681, + "step": 434 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 3.3312325477600098, + "learning_rate": 4.35121914037981e-06, + "loss": 0.298, + "step": 435 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 2.980583906173706, + "learning_rate": 4.348363269764462e-06, + "loss": 0.3618, + "step": 436 + }, + { + "epoch": 2.362162162162162, + "grad_norm": 3.5010197162628174, + "learning_rate": 4.345502068949003e-06, + "loss": 0.8972, + "step": 437 + }, + { + "epoch": 2.3675675675675674, + "grad_norm": 2.7187814712524414, + "learning_rate": 4.342635546184394e-06, + "loss": 0.3939, + "step": 438 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 2.8368170261383057, + "learning_rate": 4.339763709736944e-06, + "loss": 0.5462, + "step": 439 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 2.6989636421203613, + "learning_rate": 4.336886567888283e-06, + "loss": 0.5932, + "step": 440 + }, + { + "epoch": 2.383783783783784, + "grad_norm": 3.2514829635620117, + "learning_rate": 4.334004128935342e-06, + "loss": 0.4622, + "step": 441 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 5.242766857147217, + "learning_rate": 4.331116401190327e-06, + "loss": 0.5997, + "step": 442 + }, + { + "epoch": 2.3945945945945946, + "grad_norm": 3.492724657058716, + "learning_rate": 4.328223392980696e-06, + "loss": 0.3072, + "step": 443 + }, + { + "epoch": 2.4, + "grad_norm": 4.074132442474365, + "learning_rate": 4.325325112649134e-06, + "loss": 0.5338, + "step": 444 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 2.7208468914031982, + "learning_rate": 4.322421568553529e-06, + "loss": 0.3266, + "step": 445 + }, + { + "epoch": 2.410810810810811, + "grad_norm": 2.929180383682251, + "learning_rate": 4.3195127690669494e-06, + "loss": 0.4064, + "step": 446 + }, + { + "epoch": 2.4162162162162164, + "grad_norm": 2.848353624343872, + "learning_rate": 4.3165987225776186e-06, + "loss": 0.3856, + "step": 447 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 3.946488618850708, + "learning_rate": 4.313679437488889e-06, + "loss": 0.4261, + "step": 448 + }, + { + "epoch": 2.427027027027027, + "grad_norm": 5.781888961791992, + "learning_rate": 4.310754922219223e-06, + "loss": 0.4943, + "step": 449 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 2.8406941890716553, + "learning_rate": 4.307825185202164e-06, + "loss": 0.2874, + "step": 450 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 3.2017335891723633, + "learning_rate": 4.3048902348863116e-06, + "loss": 0.4218, + "step": 451 + }, + { + "epoch": 2.443243243243243, + "grad_norm": 3.8355906009674072, + "learning_rate": 4.301950079735303e-06, + "loss": 0.4204, + "step": 452 + }, + { + "epoch": 2.4486486486486485, + "grad_norm": 4.783357620239258, + "learning_rate": 4.299004728227782e-06, + "loss": 0.5593, + "step": 453 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 3.014080762863159, + "learning_rate": 4.2960541888573774e-06, + "loss": 0.4187, + "step": 454 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 3.5906598567962646, + "learning_rate": 4.29309847013268e-06, + "loss": 0.4193, + "step": 455 + }, + { + "epoch": 2.464864864864865, + "grad_norm": 3.9043331146240234, + "learning_rate": 4.290137580577216e-06, + "loss": 0.7035, + "step": 456 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 3.139753580093384, + "learning_rate": 4.287171528729423e-06, + "loss": 0.5877, + "step": 457 + }, + { + "epoch": 2.4756756756756757, + "grad_norm": 2.9091074466705322, + "learning_rate": 4.284200323142623e-06, + "loss": 0.5309, + "step": 458 + }, + { + "epoch": 2.481081081081081, + "grad_norm": 3.1253795623779297, + "learning_rate": 4.281223972385004e-06, + "loss": 0.448, + "step": 459 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 2.65510892868042, + "learning_rate": 4.27824248503959e-06, + "loss": 0.4453, + "step": 460 + }, + { + "epoch": 2.4918918918918918, + "grad_norm": 3.2135510444641113, + "learning_rate": 4.275255869704214e-06, + "loss": 0.5582, + "step": 461 + }, + { + "epoch": 2.4972972972972975, + "grad_norm": 2.452545404434204, + "learning_rate": 4.272264134991503e-06, + "loss": 0.423, + "step": 462 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 2.6370208263397217, + "learning_rate": 4.269267289528843e-06, + "loss": 0.271, + "step": 463 + }, + { + "epoch": 2.5081081081081082, + "grad_norm": 3.31266450881958, + "learning_rate": 4.266265341958356e-06, + "loss": 0.6459, + "step": 464 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 3.2743148803710938, + "learning_rate": 4.263258300936882e-06, + "loss": 0.2959, + "step": 465 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 2.883549690246582, + "learning_rate": 4.260246175135948e-06, + "loss": 0.3418, + "step": 466 + }, + { + "epoch": 2.5243243243243243, + "grad_norm": 2.7019498348236084, + "learning_rate": 4.257228973241742e-06, + "loss": 0.3459, + "step": 467 + }, + { + "epoch": 2.5297297297297296, + "grad_norm": 3.8166959285736084, + "learning_rate": 4.254206703955092e-06, + "loss": 0.4769, + "step": 468 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 3.264763593673706, + "learning_rate": 4.251179375991438e-06, + "loss": 0.6487, + "step": 469 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 2.7936933040618896, + "learning_rate": 4.248146998080808e-06, + "loss": 0.5547, + "step": 470 + }, + { + "epoch": 2.545945945945946, + "grad_norm": 3.21852707862854, + "learning_rate": 4.2451095789677945e-06, + "loss": 0.2965, + "step": 471 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 3.4528985023498535, + "learning_rate": 4.242067127411525e-06, + "loss": 0.3831, + "step": 472 + }, + { + "epoch": 2.556756756756757, + "grad_norm": 4.317023754119873, + "learning_rate": 4.239019652185642e-06, + "loss": 0.1756, + "step": 473 + }, + { + "epoch": 2.562162162162162, + "grad_norm": 3.677452325820923, + "learning_rate": 4.2359671620782725e-06, + "loss": 0.5136, + "step": 474 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 3.7563393115997314, + "learning_rate": 4.232909665892005e-06, + "loss": 0.6554, + "step": 475 + }, + { + "epoch": 2.572972972972973, + "grad_norm": 3.5125508308410645, + "learning_rate": 4.229847172443866e-06, + "loss": 0.3804, + "step": 476 + }, + { + "epoch": 2.5783783783783782, + "grad_norm": 2.8835806846618652, + "learning_rate": 4.2267796905652926e-06, + "loss": 0.3338, + "step": 477 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 3.2136261463165283, + "learning_rate": 4.223707229102105e-06, + "loss": 0.6163, + "step": 478 + }, + { + "epoch": 2.589189189189189, + "grad_norm": 3.467475175857544, + "learning_rate": 4.220629796914487e-06, + "loss": 0.3005, + "step": 479 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 3.597490072250366, + "learning_rate": 4.217547402876954e-06, + "loss": 0.56, + "step": 480 + }, + { + "epoch": 2.6, + "grad_norm": 3.2377140522003174, + "learning_rate": 4.214460055878329e-06, + "loss": 0.4512, + "step": 481 + }, + { + "epoch": 2.6054054054054054, + "grad_norm": 2.577746868133545, + "learning_rate": 4.211367764821722e-06, + "loss": 0.3074, + "step": 482 + }, + { + "epoch": 2.610810810810811, + "grad_norm": 3.6584155559539795, + "learning_rate": 4.208270538624497e-06, + "loss": 0.6752, + "step": 483 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 2.602778434753418, + "learning_rate": 4.205168386218251e-06, + "loss": 0.2347, + "step": 484 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 3.587503433227539, + "learning_rate": 4.2020613165487865e-06, + "loss": 0.5189, + "step": 485 + }, + { + "epoch": 2.627027027027027, + "grad_norm": 3.9341986179351807, + "learning_rate": 4.198949338576086e-06, + "loss": 0.7739, + "step": 486 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 2.9211957454681396, + "learning_rate": 4.1958324612742875e-06, + "loss": 0.3495, + "step": 487 + }, + { + "epoch": 2.637837837837838, + "grad_norm": 3.29193115234375, + "learning_rate": 4.1927106936316564e-06, + "loss": 0.2257, + "step": 488 + }, + { + "epoch": 2.6432432432432433, + "grad_norm": 3.3687057495117188, + "learning_rate": 4.189584044650559e-06, + "loss": 0.6708, + "step": 489 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 3.096428155899048, + "learning_rate": 4.186452523347441e-06, + "loss": 0.3126, + "step": 490 + }, + { + "epoch": 2.654054054054054, + "grad_norm": 3.0865559577941895, + "learning_rate": 4.183316138752799e-06, + "loss": 0.4219, + "step": 491 + }, + { + "epoch": 2.6594594594594594, + "grad_norm": 3.389827013015747, + "learning_rate": 4.180174899911149e-06, + "loss": 0.3937, + "step": 492 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 3.044360637664795, + "learning_rate": 4.177028815881012e-06, + "loss": 0.4098, + "step": 493 + }, + { + "epoch": 2.6702702702702705, + "grad_norm": 2.813094139099121, + "learning_rate": 4.173877895734875e-06, + "loss": 0.3597, + "step": 494 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 2.4037158489227295, + "learning_rate": 4.1707221485591764e-06, + "loss": 0.3284, + "step": 495 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 3.049436092376709, + "learning_rate": 4.167561583454272e-06, + "loss": 0.257, + "step": 496 + }, + { + "epoch": 2.6864864864864866, + "grad_norm": 3.458923816680908, + "learning_rate": 4.164396209534411e-06, + "loss": 0.1819, + "step": 497 + }, + { + "epoch": 2.691891891891892, + "grad_norm": 3.3084232807159424, + "learning_rate": 4.161226035927711e-06, + "loss": 0.7109, + "step": 498 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 3.034550189971924, + "learning_rate": 4.15805107177613e-06, + "loss": 0.6297, + "step": 499 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 3.5786449909210205, + "learning_rate": 4.15487132623544e-06, + "loss": 0.5195, + "step": 500 + }, + { + "epoch": 2.708108108108108, + "grad_norm": 3.4477646350860596, + "learning_rate": 4.151686808475204e-06, + "loss": 0.2528, + "step": 501 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 3.0256869792938232, + "learning_rate": 4.148497527678744e-06, + "loss": 0.5013, + "step": 502 + }, + { + "epoch": 2.718918918918919, + "grad_norm": 2.875121593475342, + "learning_rate": 4.145303493043118e-06, + "loss": 0.4109, + "step": 503 + }, + { + "epoch": 2.7243243243243245, + "grad_norm": 2.7204222679138184, + "learning_rate": 4.1421047137790935e-06, + "loss": 0.3197, + "step": 504 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 3.350482702255249, + "learning_rate": 4.13890119911112e-06, + "loss": 0.6369, + "step": 505 + }, + { + "epoch": 2.735135135135135, + "grad_norm": 3.096774101257324, + "learning_rate": 4.135692958277303e-06, + "loss": 0.4581, + "step": 506 + }, + { + "epoch": 2.7405405405405405, + "grad_norm": 2.8896536827087402, + "learning_rate": 4.132480000529375e-06, + "loss": 0.6217, + "step": 507 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 2.643932580947876, + "learning_rate": 4.129262335132676e-06, + "loss": 0.4951, + "step": 508 + }, + { + "epoch": 2.7513513513513512, + "grad_norm": 2.6077864170074463, + "learning_rate": 4.126039971366114e-06, + "loss": 0.2185, + "step": 509 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 2.531507968902588, + "learning_rate": 4.122812918522154e-06, + "loss": 0.5428, + "step": 510 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 4.125836372375488, + "learning_rate": 4.119581185906776e-06, + "loss": 0.5466, + "step": 511 + }, + { + "epoch": 2.7675675675675677, + "grad_norm": 2.9921016693115234, + "learning_rate": 4.1163447828394595e-06, + "loss": 0.3803, + "step": 512 + }, + { + "epoch": 2.772972972972973, + "grad_norm": 2.9517931938171387, + "learning_rate": 4.113103718653152e-06, + "loss": 0.2722, + "step": 513 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 2.8333382606506348, + "learning_rate": 4.10985800269424e-06, + "loss": 0.333, + "step": 514 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 2.94168758392334, + "learning_rate": 4.106607644322529e-06, + "loss": 0.2186, + "step": 515 + }, + { + "epoch": 2.789189189189189, + "grad_norm": 3.2743892669677734, + "learning_rate": 4.103352652911207e-06, + "loss": 0.6365, + "step": 516 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 4.692770004272461, + "learning_rate": 4.100093037846825e-06, + "loss": 0.7261, + "step": 517 + }, + { + "epoch": 2.8, + "grad_norm": 3.2157247066497803, + "learning_rate": 4.0968288085292675e-06, + "loss": 0.2767, + "step": 518 + }, + { + "epoch": 2.8054054054054056, + "grad_norm": 3.196887731552124, + "learning_rate": 4.093559974371725e-06, + "loss": 0.4743, + "step": 519 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 2.406752586364746, + "learning_rate": 4.090286544800667e-06, + "loss": 0.3789, + "step": 520 + }, + { + "epoch": 2.8162162162162163, + "grad_norm": 3.1769447326660156, + "learning_rate": 4.087008529255815e-06, + "loss": 0.6252, + "step": 521 + }, + { + "epoch": 2.8216216216216217, + "grad_norm": 3.068370819091797, + "learning_rate": 4.083725937190115e-06, + "loss": 0.3467, + "step": 522 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 3.2665855884552, + "learning_rate": 4.0804387780697114e-06, + "loss": 0.3857, + "step": 523 + }, + { + "epoch": 2.8324324324324324, + "grad_norm": 3.368759870529175, + "learning_rate": 4.077147061373918e-06, + "loss": 0.4679, + "step": 524 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 3.989163875579834, + "learning_rate": 4.073850796595192e-06, + "loss": 0.2439, + "step": 525 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 3.6244685649871826, + "learning_rate": 4.070549993239106e-06, + "loss": 0.435, + "step": 526 + }, + { + "epoch": 2.8486486486486484, + "grad_norm": 3.585151195526123, + "learning_rate": 4.06724466082432e-06, + "loss": 0.5022, + "step": 527 + }, + { + "epoch": 2.854054054054054, + "grad_norm": 3.2420976161956787, + "learning_rate": 4.063934808882555e-06, + "loss": 0.4282, + "step": 528 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 3.1674294471740723, + "learning_rate": 4.0606204469585656e-06, + "loss": 0.3436, + "step": 529 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 2.6856706142425537, + "learning_rate": 4.057301584610112e-06, + "loss": 0.3889, + "step": 530 + }, + { + "epoch": 2.8702702702702703, + "grad_norm": 3.0438942909240723, + "learning_rate": 4.053978231407931e-06, + "loss": 0.4828, + "step": 531 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 3.3561246395111084, + "learning_rate": 4.0506503969357115e-06, + "loss": 0.5814, + "step": 532 + }, + { + "epoch": 2.881081081081081, + "grad_norm": 2.5318350791931152, + "learning_rate": 4.047318090790065e-06, + "loss": 0.4768, + "step": 533 + }, + { + "epoch": 2.8864864864864863, + "grad_norm": 2.587224006652832, + "learning_rate": 4.043981322580498e-06, + "loss": 0.4262, + "step": 534 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 2.73926043510437, + "learning_rate": 4.040640101929384e-06, + "loss": 0.421, + "step": 535 + }, + { + "epoch": 2.8972972972972975, + "grad_norm": 3.53908371925354, + "learning_rate": 4.037294438471936e-06, + "loss": 0.4019, + "step": 536 + }, + { + "epoch": 2.902702702702703, + "grad_norm": 3.0980448722839355, + "learning_rate": 4.033944341856181e-06, + "loss": 0.4322, + "step": 537 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 2.9265666007995605, + "learning_rate": 4.030589821742926e-06, + "loss": 0.3841, + "step": 538 + }, + { + "epoch": 2.9135135135135135, + "grad_norm": 3.4082043170928955, + "learning_rate": 4.0272308878057385e-06, + "loss": 0.7083, + "step": 539 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 3.297515630722046, + "learning_rate": 4.023867549730912e-06, + "loss": 0.5688, + "step": 540 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 3.0538225173950195, + "learning_rate": 4.020499817217441e-06, + "loss": 0.5979, + "step": 541 + }, + { + "epoch": 2.92972972972973, + "grad_norm": 3.1792757511138916, + "learning_rate": 4.017127699976992e-06, + "loss": 0.5034, + "step": 542 + }, + { + "epoch": 2.935135135135135, + "grad_norm": 3.1574482917785645, + "learning_rate": 4.013751207733877e-06, + "loss": 0.6656, + "step": 543 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 2.523123264312744, + "learning_rate": 4.010370350225023e-06, + "loss": 0.2789, + "step": 544 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 3.1950793266296387, + "learning_rate": 4.006985137199945e-06, + "loss": 0.2163, + "step": 545 + }, + { + "epoch": 2.9513513513513514, + "grad_norm": 3.2089648246765137, + "learning_rate": 4.00359557842072e-06, + "loss": 0.4179, + "step": 546 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 3.852578639984131, + "learning_rate": 4.000201683661958e-06, + "loss": 0.4683, + "step": 547 + }, + { + "epoch": 2.962162162162162, + "grad_norm": 2.7612597942352295, + "learning_rate": 3.996803462710766e-06, + "loss": 0.3506, + "step": 548 + }, + { + "epoch": 2.9675675675675675, + "grad_norm": 4.811823844909668, + "learning_rate": 3.993400925366736e-06, + "loss": 0.6582, + "step": 549 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 3.0135858058929443, + "learning_rate": 3.989994081441902e-06, + "loss": 0.504, + "step": 550 + }, + { + "epoch": 2.9783783783783786, + "grad_norm": 2.710277795791626, + "learning_rate": 3.986582940760717e-06, + "loss": 0.7362, + "step": 551 + }, + { + "epoch": 2.983783783783784, + "grad_norm": 3.175443649291992, + "learning_rate": 3.983167513160025e-06, + "loss": 0.4116, + "step": 552 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 3.101109743118286, + "learning_rate": 3.979747808489036e-06, + "loss": 0.2188, + "step": 553 + }, + { + "epoch": 2.9945945945945946, + "grad_norm": 3.2320079803466797, + "learning_rate": 3.976323836609289e-06, + "loss": 0.7558, + "step": 554 + }, + { + "epoch": 3.0, + "grad_norm": 3.6071934700012207, + "learning_rate": 3.9728956073946305e-06, + "loss": 0.6491, + "step": 555 + }, + { + "epoch": 3.0054054054054054, + "grad_norm": 3.1119353771209717, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1625, + "step": 556 + }, + { + "epoch": 3.0108108108108107, + "grad_norm": 3.0440328121185303, + "learning_rate": 3.966026416517321e-06, + "loss": 0.311, + "step": 557 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 4.069122791290283, + "learning_rate": 3.962585474663636e-06, + "loss": 0.5299, + "step": 558 + }, + { + "epoch": 3.0216216216216214, + "grad_norm": 2.878645896911621, + "learning_rate": 3.959140315092911e-06, + "loss": 0.2718, + "step": 559 + }, + { + "epoch": 3.027027027027027, + "grad_norm": 3.526695966720581, + "learning_rate": 3.955690947740092e-06, + "loss": 0.2954, + "step": 560 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 3.25087308883667, + "learning_rate": 3.95223738255226e-06, + "loss": 0.2388, + "step": 561 + }, + { + "epoch": 3.037837837837838, + "grad_norm": 3.5467700958251953, + "learning_rate": 3.9487796294886015e-06, + "loss": 0.2014, + "step": 562 + }, + { + "epoch": 3.0432432432432432, + "grad_norm": 4.397517681121826, + "learning_rate": 3.945317698520379e-06, + "loss": 0.2102, + "step": 563 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 3.7297182083129883, + "learning_rate": 3.941851599630903e-06, + "loss": 0.499, + "step": 564 + }, + { + "epoch": 3.054054054054054, + "grad_norm": 4.417158603668213, + "learning_rate": 3.938381342815503e-06, + "loss": 0.3392, + "step": 565 + }, + { + "epoch": 3.0594594594594593, + "grad_norm": 4.6037421226501465, + "learning_rate": 3.934906938081499e-06, + "loss": 0.1942, + "step": 566 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 3.5600531101226807, + "learning_rate": 3.931428395448174e-06, + "loss": 0.1753, + "step": 567 + }, + { + "epoch": 3.0702702702702704, + "grad_norm": 2.868013381958008, + "learning_rate": 3.927945724946743e-06, + "loss": 0.2959, + "step": 568 + }, + { + "epoch": 3.075675675675676, + "grad_norm": 3.5543227195739746, + "learning_rate": 3.924458936620322e-06, + "loss": 0.4625, + "step": 569 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 8.972922325134277, + "learning_rate": 3.920968040523904e-06, + "loss": 0.2571, + "step": 570 + }, + { + "epoch": 3.0864864864864865, + "grad_norm": 3.037388324737549, + "learning_rate": 3.917473046724329e-06, + "loss": 0.1438, + "step": 571 + }, + { + "epoch": 3.091891891891892, + "grad_norm": 3.3261702060699463, + "learning_rate": 3.9139739653002525e-06, + "loss": 0.3572, + "step": 572 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 2.425293207168579, + "learning_rate": 3.910470806342117e-06, + "loss": 0.165, + "step": 573 + }, + { + "epoch": 3.1027027027027025, + "grad_norm": 3.5718603134155273, + "learning_rate": 3.9069635799521245e-06, + "loss": 0.3209, + "step": 574 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 3.8211171627044678, + "learning_rate": 3.903452296244204e-06, + "loss": 0.1976, + "step": 575 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 5.944535255432129, + "learning_rate": 3.899936965343989e-06, + "loss": 0.6074, + "step": 576 + }, + { + "epoch": 3.118918918918919, + "grad_norm": 6.603860378265381, + "learning_rate": 3.89641759738878e-06, + "loss": 0.4051, + "step": 577 + }, + { + "epoch": 3.1243243243243244, + "grad_norm": 6.712981700897217, + "learning_rate": 3.892894202527523e-06, + "loss": 0.3787, + "step": 578 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 3.267186403274536, + "learning_rate": 3.8893667909207735e-06, + "loss": 0.0927, + "step": 579 + }, + { + "epoch": 3.135135135135135, + "grad_norm": 4.476837158203125, + "learning_rate": 3.88583537274067e-06, + "loss": 0.4706, + "step": 580 + }, + { + "epoch": 3.1405405405405404, + "grad_norm": 4.272335052490234, + "learning_rate": 3.8822999581709085e-06, + "loss": 0.3949, + "step": 581 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 3.6685309410095215, + "learning_rate": 3.878760557406708e-06, + "loss": 0.1971, + "step": 582 + }, + { + "epoch": 3.1513513513513516, + "grad_norm": 3.9899449348449707, + "learning_rate": 3.875217180654779e-06, + "loss": 0.5156, + "step": 583 + }, + { + "epoch": 3.156756756756757, + "grad_norm": 3.866804361343384, + "learning_rate": 3.871669838133303e-06, + "loss": 0.3552, + "step": 584 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 3.565648317337036, + "learning_rate": 3.868118540071894e-06, + "loss": 0.4369, + "step": 585 + }, + { + "epoch": 3.1675675675675676, + "grad_norm": 3.5073986053466797, + "learning_rate": 3.8645632967115755e-06, + "loss": 0.3694, + "step": 586 + }, + { + "epoch": 3.172972972972973, + "grad_norm": 3.7636868953704834, + "learning_rate": 3.861004118304746e-06, + "loss": 0.3404, + "step": 587 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 2.940094232559204, + "learning_rate": 3.857441015115154e-06, + "loss": 0.3086, + "step": 588 + }, + { + "epoch": 3.1837837837837837, + "grad_norm": 3.727414608001709, + "learning_rate": 3.8538739974178635e-06, + "loss": 0.253, + "step": 589 + }, + { + "epoch": 3.189189189189189, + "grad_norm": 3.5140156745910645, + "learning_rate": 3.850303075499227e-06, + "loss": 0.2436, + "step": 590 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 3.545952558517456, + "learning_rate": 3.84672825965686e-06, + "loss": 0.328, + "step": 591 + }, + { + "epoch": 3.2, + "grad_norm": 3.534240484237671, + "learning_rate": 3.843149560199601e-06, + "loss": 0.2687, + "step": 592 + }, + { + "epoch": 3.2054054054054055, + "grad_norm": 2.8464927673339844, + "learning_rate": 3.839566987447492e-06, + "loss": 0.1417, + "step": 593 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 4.138559818267822, + "learning_rate": 3.835980551731743e-06, + "loss": 0.2106, + "step": 594 + }, + { + "epoch": 3.2162162162162162, + "grad_norm": 2.917670249938965, + "learning_rate": 3.8323902633947045e-06, + "loss": 0.3154, + "step": 595 + }, + { + "epoch": 3.2216216216216216, + "grad_norm": 3.029660224914551, + "learning_rate": 3.828796132789835e-06, + "loss": 0.1218, + "step": 596 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 3.2845771312713623, + "learning_rate": 3.825198170281677e-06, + "loss": 0.1336, + "step": 597 + }, + { + "epoch": 3.2324324324324323, + "grad_norm": 3.1375670433044434, + "learning_rate": 3.821596386245819e-06, + "loss": 0.2518, + "step": 598 + }, + { + "epoch": 3.237837837837838, + "grad_norm": 3.0021941661834717, + "learning_rate": 3.817990791068874e-06, + "loss": 0.2762, + "step": 599 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 4.141000747680664, + "learning_rate": 3.81438139514844e-06, + "loss": 0.2722, + "step": 600 + }, + { + "epoch": 3.2486486486486488, + "grad_norm": 3.9065279960632324, + "learning_rate": 3.8107682088930797e-06, + "loss": 0.3542, + "step": 601 + }, + { + "epoch": 3.254054054054054, + "grad_norm": 3.718417167663574, + "learning_rate": 3.807151242722286e-06, + "loss": 0.344, + "step": 602 + }, + { + "epoch": 3.2594594594594595, + "grad_norm": 4.013717174530029, + "learning_rate": 3.8035305070664484e-06, + "loss": 0.1625, + "step": 603 + }, + { + "epoch": 3.264864864864865, + "grad_norm": 3.348888397216797, + "learning_rate": 3.7999060123668318e-06, + "loss": 0.2925, + "step": 604 + }, + { + "epoch": 3.27027027027027, + "grad_norm": 3.496079206466675, + "learning_rate": 3.7962777690755364e-06, + "loss": 0.1523, + "step": 605 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 3.07607102394104, + "learning_rate": 3.792645787655476e-06, + "loss": 0.1674, + "step": 606 + }, + { + "epoch": 3.281081081081081, + "grad_norm": 3.4036154747009277, + "learning_rate": 3.7890100785803425e-06, + "loss": 0.2856, + "step": 607 + }, + { + "epoch": 3.2864864864864867, + "grad_norm": 6.092559337615967, + "learning_rate": 3.785370652334577e-06, + "loss": 0.1094, + "step": 608 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 3.9322001934051514, + "learning_rate": 3.7817275194133403e-06, + "loss": 0.2611, + "step": 609 + }, + { + "epoch": 3.2972972972972974, + "grad_norm": 3.189563274383545, + "learning_rate": 3.778080690322483e-06, + "loss": 0.1315, + "step": 610 + }, + { + "epoch": 3.3027027027027027, + "grad_norm": 4.304934024810791, + "learning_rate": 3.774430175578514e-06, + "loss": 0.1686, + "step": 611 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 2.9030067920684814, + "learning_rate": 3.7707759857085706e-06, + "loss": 0.4642, + "step": 612 + }, + { + "epoch": 3.3135135135135134, + "grad_norm": 3.7485930919647217, + "learning_rate": 3.7671181312503886e-06, + "loss": 0.1987, + "step": 613 + }, + { + "epoch": 3.3189189189189188, + "grad_norm": 3.4700896739959717, + "learning_rate": 3.763456622752271e-06, + "loss": 0.3307, + "step": 614 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 3.0079376697540283, + "learning_rate": 3.7597914707730583e-06, + "loss": 0.1731, + "step": 615 + }, + { + "epoch": 3.32972972972973, + "grad_norm": 3.155235767364502, + "learning_rate": 3.7561226858820984e-06, + "loss": 0.2003, + "step": 616 + }, + { + "epoch": 3.3351351351351353, + "grad_norm": 3.847895622253418, + "learning_rate": 3.7524502786592143e-06, + "loss": 0.4014, + "step": 617 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 2.7505502700805664, + "learning_rate": 3.7487742596946753e-06, + "loss": 0.205, + "step": 618 + }, + { + "epoch": 3.345945945945946, + "grad_norm": 3.654529571533203, + "learning_rate": 3.7450946395891674e-06, + "loss": 0.2932, + "step": 619 + }, + { + "epoch": 3.3513513513513513, + "grad_norm": 2.9763967990875244, + "learning_rate": 3.7414114289537593e-06, + "loss": 0.2748, + "step": 620 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 3.889683961868286, + "learning_rate": 3.7377246384098763e-06, + "loss": 0.3665, + "step": 621 + }, + { + "epoch": 3.362162162162162, + "grad_norm": 4.193166732788086, + "learning_rate": 3.7340342785892645e-06, + "loss": 0.3453, + "step": 622 + }, + { + "epoch": 3.3675675675675674, + "grad_norm": 3.4371488094329834, + "learning_rate": 3.7303403601339646e-06, + "loss": 0.473, + "step": 623 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 3.6939027309417725, + "learning_rate": 3.726642893696279e-06, + "loss": 0.3017, + "step": 624 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 4.904304504394531, + "learning_rate": 3.7229418899387414e-06, + "loss": 0.4841, + "step": 625 + }, + { + "epoch": 3.383783783783784, + "grad_norm": 3.6373438835144043, + "learning_rate": 3.719237359534087e-06, + "loss": 0.3879, + "step": 626 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 3.403676986694336, + "learning_rate": 3.71552931316522e-06, + "loss": 0.3876, + "step": 627 + }, + { + "epoch": 3.3945945945945946, + "grad_norm": 3.2292237281799316, + "learning_rate": 3.7118177615251834e-06, + "loss": 0.4491, + "step": 628 + }, + { + "epoch": 3.4, + "grad_norm": 3.317850351333618, + "learning_rate": 3.70810271531713e-06, + "loss": 0.3763, + "step": 629 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 3.664735794067383, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4171, + "step": 630 + }, + { + "epoch": 3.410810810810811, + "grad_norm": 3.781569242477417, + "learning_rate": 3.700662182059936e-06, + "loss": 0.2445, + "step": 631 + }, + { + "epoch": 3.4162162162162164, + "grad_norm": 2.878260850906372, + "learning_rate": 3.696936716467363e-06, + "loss": 0.1347, + "step": 632 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 2.8670761585235596, + "learning_rate": 3.693207799219846e-06, + "loss": 0.2822, + "step": 633 + }, + { + "epoch": 3.427027027027027, + "grad_norm": 3.9338245391845703, + "learning_rate": 3.689475441070615e-06, + "loss": 0.3425, + "step": 634 + }, + { + "epoch": 3.4324324324324325, + "grad_norm": 3.3172149658203125, + "learning_rate": 3.685739652782822e-06, + "loss": 0.3315, + "step": 635 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 3.9986648559570312, + "learning_rate": 3.682000445129512e-06, + "loss": 0.1841, + "step": 636 + }, + { + "epoch": 3.443243243243243, + "grad_norm": 3.4503986835479736, + "learning_rate": 3.6782578288935896e-06, + "loss": 0.3151, + "step": 637 + }, + { + "epoch": 3.4486486486486485, + "grad_norm": 3.8826167583465576, + "learning_rate": 3.6745118148677882e-06, + "loss": 0.1272, + "step": 638 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 3.0585904121398926, + "learning_rate": 3.6707624138546414e-06, + "loss": 0.2436, + "step": 639 + }, + { + "epoch": 3.4594594594594597, + "grad_norm": 3.8409557342529297, + "learning_rate": 3.6670096366664477e-06, + "loss": 0.6321, + "step": 640 + }, + { + "epoch": 3.464864864864865, + "grad_norm": 3.7260093688964844, + "learning_rate": 3.663253494125244e-06, + "loss": 0.1262, + "step": 641 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 3.195587396621704, + "learning_rate": 3.6594939970627706e-06, + "loss": 0.2669, + "step": 642 + }, + { + "epoch": 3.4756756756756757, + "grad_norm": 2.565070629119873, + "learning_rate": 3.655731156320441e-06, + "loss": 0.1228, + "step": 643 + }, + { + "epoch": 3.481081081081081, + "grad_norm": 3.745422124862671, + "learning_rate": 3.651964982749312e-06, + "loss": 0.1759, + "step": 644 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 4.96168327331543, + "learning_rate": 3.648195487210051e-06, + "loss": 0.5677, + "step": 645 + }, + { + "epoch": 3.4918918918918918, + "grad_norm": 3.514446496963501, + "learning_rate": 3.644422680572906e-06, + "loss": 0.1874, + "step": 646 + }, + { + "epoch": 3.4972972972972975, + "grad_norm": 3.1427719593048096, + "learning_rate": 3.640646573717671e-06, + "loss": 0.3225, + "step": 647 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 3.32208514213562, + "learning_rate": 3.63686717753366e-06, + "loss": 0.102, + "step": 648 + }, + { + "epoch": 3.5081081081081082, + "grad_norm": 3.409299373626709, + "learning_rate": 3.6330845029196697e-06, + "loss": 0.1585, + "step": 649 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 2.827052116394043, + "learning_rate": 3.629298560783952e-06, + "loss": 0.3046, + "step": 650 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 3.541518211364746, + "learning_rate": 3.6255093620441835e-06, + "loss": 0.2037, + "step": 651 + }, + { + "epoch": 3.5243243243243243, + "grad_norm": 3.067040205001831, + "learning_rate": 3.6217169176274293e-06, + "loss": 0.1784, + "step": 652 + }, + { + "epoch": 3.5297297297297296, + "grad_norm": 4.001040935516357, + "learning_rate": 3.6179212384701146e-06, + "loss": 0.1974, + "step": 653 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 4.03037691116333, + "learning_rate": 3.6141223355179946e-06, + "loss": 0.2161, + "step": 654 + }, + { + "epoch": 3.5405405405405403, + "grad_norm": 3.303591728210449, + "learning_rate": 3.610320219726118e-06, + "loss": 0.1487, + "step": 655 + }, + { + "epoch": 3.545945945945946, + "grad_norm": 4.183008193969727, + "learning_rate": 3.606514902058802e-06, + "loss": 0.2231, + "step": 656 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 4.2100300788879395, + "learning_rate": 3.602706393489594e-06, + "loss": 0.5068, + "step": 657 + }, + { + "epoch": 3.556756756756757, + "grad_norm": 4.521003246307373, + "learning_rate": 3.598894705001246e-06, + "loss": 0.4621, + "step": 658 + }, + { + "epoch": 3.562162162162162, + "grad_norm": 3.452348470687866, + "learning_rate": 3.5950798475856783e-06, + "loss": 0.285, + "step": 659 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 3.468987464904785, + "learning_rate": 3.5912618322439487e-06, + "loss": 0.4277, + "step": 660 + }, + { + "epoch": 3.572972972972973, + "grad_norm": 3.431551933288574, + "learning_rate": 3.587440669986224e-06, + "loss": 0.1993, + "step": 661 + }, + { + "epoch": 3.5783783783783782, + "grad_norm": 3.017648220062256, + "learning_rate": 3.5836163718317453e-06, + "loss": 0.272, + "step": 662 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 3.837244987487793, + "learning_rate": 3.5797889488087946e-06, + "loss": 0.6019, + "step": 663 + }, + { + "epoch": 3.589189189189189, + "grad_norm": 3.221762180328369, + "learning_rate": 3.575958411954668e-06, + "loss": 0.3603, + "step": 664 + }, + { + "epoch": 3.5945945945945947, + "grad_norm": 4.279484272003174, + "learning_rate": 3.5721247723156393e-06, + "loss": 0.4656, + "step": 665 + }, + { + "epoch": 3.6, + "grad_norm": 3.723459243774414, + "learning_rate": 3.5682880409469316e-06, + "loss": 0.2466, + "step": 666 + }, + { + "epoch": 3.6054054054054054, + "grad_norm": 2.7260632514953613, + "learning_rate": 3.564448228912682e-06, + "loss": 0.1848, + "step": 667 + }, + { + "epoch": 3.610810810810811, + "grad_norm": 3.6656649112701416, + "learning_rate": 3.5606053472859124e-06, + "loss": 0.4968, + "step": 668 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 4.570294380187988, + "learning_rate": 3.556759407148496e-06, + "loss": 0.316, + "step": 669 + }, + { + "epoch": 3.6216216216216215, + "grad_norm": 3.174433946609497, + "learning_rate": 3.5529104195911258e-06, + "loss": 0.2232, + "step": 670 + }, + { + "epoch": 3.627027027027027, + "grad_norm": 4.481954574584961, + "learning_rate": 3.549058395713285e-06, + "loss": 0.4435, + "step": 671 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 3.8758301734924316, + "learning_rate": 3.54520334662321e-06, + "loss": 0.1455, + "step": 672 + }, + { + "epoch": 3.637837837837838, + "grad_norm": 3.1699628829956055, + "learning_rate": 3.5413452834378626e-06, + "loss": 0.3037, + "step": 673 + }, + { + "epoch": 3.6432432432432433, + "grad_norm": 3.8971962928771973, + "learning_rate": 3.5374842172828953e-06, + "loss": 0.4309, + "step": 674 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 3.3087549209594727, + "learning_rate": 3.533620159292621e-06, + "loss": 0.383, + "step": 675 + }, + { + "epoch": 3.654054054054054, + "grad_norm": 2.9413082599639893, + "learning_rate": 3.529753120609982e-06, + "loss": 0.1963, + "step": 676 + }, + { + "epoch": 3.6594594594594594, + "grad_norm": 3.309837818145752, + "learning_rate": 3.5258831123865136e-06, + "loss": 0.1922, + "step": 677 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 4.124879360198975, + "learning_rate": 3.5220101457823147e-06, + "loss": 0.5589, + "step": 678 + }, + { + "epoch": 3.6702702702702705, + "grad_norm": 3.2587103843688965, + "learning_rate": 3.5181342319660174e-06, + "loss": 0.1757, + "step": 679 + }, + { + "epoch": 3.6756756756756754, + "grad_norm": 4.179666042327881, + "learning_rate": 3.5142553821147498e-06, + "loss": 0.1208, + "step": 680 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 3.4041192531585693, + "learning_rate": 3.5103736074141106e-06, + "loss": 0.2416, + "step": 681 + }, + { + "epoch": 3.6864864864864866, + "grad_norm": 4.982706546783447, + "learning_rate": 3.5064889190581293e-06, + "loss": 0.3841, + "step": 682 + }, + { + "epoch": 3.691891891891892, + "grad_norm": 3.5895309448242188, + "learning_rate": 3.5026013282492406e-06, + "loss": 0.3723, + "step": 683 + }, + { + "epoch": 3.6972972972972973, + "grad_norm": 3.4824306964874268, + "learning_rate": 3.498710846198247e-06, + "loss": 0.4403, + "step": 684 + }, + { + "epoch": 3.7027027027027026, + "grad_norm": 3.501023054122925, + "learning_rate": 3.494817484124289e-06, + "loss": 0.2813, + "step": 685 + }, + { + "epoch": 3.708108108108108, + "grad_norm": 3.934908151626587, + "learning_rate": 3.490921253254813e-06, + "loss": 0.4287, + "step": 686 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 3.24141526222229, + "learning_rate": 3.487022164825539e-06, + "loss": 0.234, + "step": 687 + }, + { + "epoch": 3.718918918918919, + "grad_norm": 3.3419880867004395, + "learning_rate": 3.4831202300804246e-06, + "loss": 0.2135, + "step": 688 + }, + { + "epoch": 3.7243243243243245, + "grad_norm": 3.923778772354126, + "learning_rate": 3.479215460271638e-06, + "loss": 0.2725, + "step": 689 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 3.2432096004486084, + "learning_rate": 3.475307866659522e-06, + "loss": 0.228, + "step": 690 + }, + { + "epoch": 3.735135135135135, + "grad_norm": 3.0307705402374268, + "learning_rate": 3.4713974605125634e-06, + "loss": 0.0985, + "step": 691 + }, + { + "epoch": 3.7405405405405405, + "grad_norm": 2.778942346572876, + "learning_rate": 3.4674842531073587e-06, + "loss": 0.2137, + "step": 692 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 3.711315155029297, + "learning_rate": 3.4635682557285833e-06, + "loss": 0.1707, + "step": 693 + }, + { + "epoch": 3.7513513513513512, + "grad_norm": 3.165668487548828, + "learning_rate": 3.459649479668956e-06, + "loss": 0.3021, + "step": 694 + }, + { + "epoch": 3.756756756756757, + "grad_norm": 3.7491254806518555, + "learning_rate": 3.4557279362292117e-06, + "loss": 0.3457, + "step": 695 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 3.271603584289551, + "learning_rate": 3.451803636718064e-06, + "loss": 0.1193, + "step": 696 + }, + { + "epoch": 3.7675675675675677, + "grad_norm": 3.872382402420044, + "learning_rate": 3.447876592452174e-06, + "loss": 0.2261, + "step": 697 + }, + { + "epoch": 3.772972972972973, + "grad_norm": 4.634008407592773, + "learning_rate": 3.4439468147561196e-06, + "loss": 0.5042, + "step": 698 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 3.6930148601531982, + "learning_rate": 3.440014314962358e-06, + "loss": 0.3481, + "step": 699 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 4.709466457366943, + "learning_rate": 3.4360791044112e-06, + "loss": 0.2317, + "step": 700 + }, + { + "epoch": 3.789189189189189, + "grad_norm": 4.37923002243042, + "learning_rate": 3.432141194450772e-06, + "loss": 0.395, + "step": 701 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 3.1600489616394043, + "learning_rate": 3.4282005964369836e-06, + "loss": 0.1767, + "step": 702 + }, + { + "epoch": 3.8, + "grad_norm": 3.9799487590789795, + "learning_rate": 3.424257321733497e-06, + "loss": 0.2146, + "step": 703 + }, + { + "epoch": 3.8054054054054056, + "grad_norm": 2.79176664352417, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.1534, + "step": 704 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 3.0024254322052, + "learning_rate": 3.4163627877506434e-06, + "loss": 0.2513, + "step": 705 + }, + { + "epoch": 3.8162162162162163, + "grad_norm": 2.924475908279419, + "learning_rate": 3.4124115512370636e-06, + "loss": 0.4154, + "step": 706 + }, + { + "epoch": 3.8216216216216217, + "grad_norm": 3.2713992595672607, + "learning_rate": 3.408457683565295e-06, + "loss": 0.1822, + "step": 707 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 3.094003438949585, + "learning_rate": 3.4045011961372675e-06, + "loss": 0.3589, + "step": 708 + }, + { + "epoch": 3.8324324324324324, + "grad_norm": 3.423858404159546, + "learning_rate": 3.4005421003624637e-06, + "loss": 0.4615, + "step": 709 + }, + { + "epoch": 3.8378378378378377, + "grad_norm": 2.038792848587036, + "learning_rate": 3.3965804076578896e-06, + "loss": 0.1001, + "step": 710 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 2.6447055339813232, + "learning_rate": 3.392616129448039e-06, + "loss": 0.2788, + "step": 711 + }, + { + "epoch": 3.8486486486486484, + "grad_norm": 3.546876907348633, + "learning_rate": 3.3886492771648593e-06, + "loss": 0.2663, + "step": 712 + }, + { + "epoch": 3.854054054054054, + "grad_norm": 2.9587066173553467, + "learning_rate": 3.384679862247726e-06, + "loss": 0.3497, + "step": 713 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 3.7122113704681396, + "learning_rate": 3.3807078961434013e-06, + "loss": 0.3613, + "step": 714 + }, + { + "epoch": 3.864864864864865, + "grad_norm": 3.157294988632202, + "learning_rate": 3.376733390306004e-06, + "loss": 0.0783, + "step": 715 + }, + { + "epoch": 3.8702702702702703, + "grad_norm": 3.564279317855835, + "learning_rate": 3.372756356196979e-06, + "loss": 0.1617, + "step": 716 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 4.231864929199219, + "learning_rate": 3.3687768052850595e-06, + "loss": 0.6444, + "step": 717 + }, + { + "epoch": 3.881081081081081, + "grad_norm": 5.480365753173828, + "learning_rate": 3.364794749046239e-06, + "loss": 0.4858, + "step": 718 + }, + { + "epoch": 3.8864864864864863, + "grad_norm": 3.428140878677368, + "learning_rate": 3.3608101989637333e-06, + "loss": 0.3103, + "step": 719 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 3.521989345550537, + "learning_rate": 3.356823166527952e-06, + "loss": 0.2501, + "step": 720 + }, + { + "epoch": 3.8972972972972975, + "grad_norm": 3.287081718444824, + "learning_rate": 3.352833663236463e-06, + "loss": 0.18, + "step": 721 + }, + { + "epoch": 3.902702702702703, + "grad_norm": 3.323146104812622, + "learning_rate": 3.348841700593956e-06, + "loss": 0.12, + "step": 722 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 3.516693115234375, + "learning_rate": 3.3448472901122187e-06, + "loss": 0.2618, + "step": 723 + }, + { + "epoch": 3.9135135135135135, + "grad_norm": 3.8109545707702637, + "learning_rate": 3.340850443310092e-06, + "loss": 0.3689, + "step": 724 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 3.8335933685302734, + "learning_rate": 3.336851171713447e-06, + "loss": 0.2195, + "step": 725 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 3.9054670333862305, + "learning_rate": 3.3328494868551444e-06, + "loss": 0.2602, + "step": 726 + }, + { + "epoch": 3.92972972972973, + "grad_norm": 3.1380631923675537, + "learning_rate": 3.3288454002750046e-06, + "loss": 0.1561, + "step": 727 + }, + { + "epoch": 3.935135135135135, + "grad_norm": 4.304198741912842, + "learning_rate": 3.3248389235197764e-06, + "loss": 0.4469, + "step": 728 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 3.3321573734283447, + "learning_rate": 3.3208300681430967e-06, + "loss": 0.2246, + "step": 729 + }, + { + "epoch": 3.945945945945946, + "grad_norm": 3.89400315284729, + "learning_rate": 3.3168188457054656e-06, + "loss": 0.2743, + "step": 730 + }, + { + "epoch": 3.9513513513513514, + "grad_norm": 3.393209934234619, + "learning_rate": 3.312805267774209e-06, + "loss": 0.551, + "step": 731 + }, + { + "epoch": 3.9567567567567568, + "grad_norm": 3.711652994155884, + "learning_rate": 3.3087893459234423e-06, + "loss": 0.3522, + "step": 732 + }, + { + "epoch": 3.962162162162162, + "grad_norm": 3.6701200008392334, + "learning_rate": 3.304771091734043e-06, + "loss": 0.3084, + "step": 733 + }, + { + "epoch": 3.9675675675675675, + "grad_norm": 3.1742889881134033, + "learning_rate": 3.300750516793614e-06, + "loss": 0.3406, + "step": 734 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 4.000397682189941, + "learning_rate": 3.2967276326964504e-06, + "loss": 0.3463, + "step": 735 + }, + { + "epoch": 3.9783783783783786, + "grad_norm": 3.7932708263397217, + "learning_rate": 3.2927024510435057e-06, + "loss": 0.3758, + "step": 736 + }, + { + "epoch": 3.983783783783784, + "grad_norm": 3.6258292198181152, + "learning_rate": 3.2886749834423587e-06, + "loss": 0.3328, + "step": 737 + }, + { + "epoch": 3.9891891891891893, + "grad_norm": 4.628194332122803, + "learning_rate": 3.284645241507183e-06, + "loss": 0.6213, + "step": 738 + }, + { + "epoch": 3.9945945945945946, + "grad_norm": 4.173697471618652, + "learning_rate": 3.280613236858707e-06, + "loss": 0.2463, + "step": 739 + }, + { + "epoch": 4.0, + "grad_norm": 2.9315719604492188, + "learning_rate": 3.2765789811241865e-06, + "loss": 0.3501, + "step": 740 + } + ], + "logging_steps": 1, + "max_steps": 1850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9969033700062003e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..39bd0c9f7fe30aea14eda194fee17703da4a4dbf --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5612543540085e09eed37e81b17ae51d1a6973 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.55.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f53ccb516e57388491adda6b9950bcfa872e93ae --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128009, + "transformers_version": "4.55.0", + "use_cache": false +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14daf4588e61b4e4983af0fccaba4d5500c0977c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6739fcd129e717b71b64001dcb25a03c143d66f5 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json @@ -0,0 +1,2076 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcff23dca6783edd2fd5334ab6ca46456e43ff6c --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json @@ -0,0 +1,6509 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 925, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005405405405405406, + "grad_norm": 72.60939025878906, + "learning_rate": 5e-06, + "loss": 2.9165, + "step": 1 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 29.01830291748047, + "learning_rate": 4.999996395324314e-06, + "loss": 1.9314, + "step": 2 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 21.44908332824707, + "learning_rate": 4.99998558130765e-06, + "loss": 1.5709, + "step": 3 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 4.490907669067383, + "learning_rate": 4.999967557981192e-06, + "loss": 0.8099, + "step": 4 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 4.000796794891357, + "learning_rate": 4.999942325396917e-06, + "loss": 0.9021, + "step": 5 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 18.513282775878906, + "learning_rate": 4.999909883627588e-06, + "loss": 1.7972, + "step": 6 + }, + { + "epoch": 0.03783783783783784, + "grad_norm": 3.5735981464385986, + "learning_rate": 4.999870232766757e-06, + "loss": 1.4306, + "step": 7 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 3.1145193576812744, + "learning_rate": 4.9998233729287696e-06, + "loss": 1.051, + "step": 8 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 3.856376886367798, + "learning_rate": 4.999769304248755e-06, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 4.05589485168457, + "learning_rate": 4.9997080268826344e-06, + "loss": 1.0999, + "step": 10 + }, + { + "epoch": 0.05945945945945946, + "grad_norm": 13.784229278564453, + "learning_rate": 4.9996395410071165e-06, + "loss": 1.2831, + "step": 11 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 6.079237937927246, + "learning_rate": 4.999563846819696e-06, + "loss": 1.2874, + "step": 12 + }, + { + "epoch": 0.07027027027027027, + "grad_norm": 4.5971245765686035, + "learning_rate": 4.999480944538655e-06, + "loss": 0.96, + "step": 13 + }, + { + "epoch": 0.07567567567567568, + "grad_norm": 4.916017532348633, + "learning_rate": 4.999390834403063e-06, + "loss": 0.9869, + "step": 14 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 3.2311055660247803, + "learning_rate": 4.999293516672773e-06, + "loss": 0.9293, + "step": 15 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 3.3040921688079834, + "learning_rate": 4.9991889916284255e-06, + "loss": 0.8914, + "step": 16 + }, + { + "epoch": 0.0918918918918919, + "grad_norm": 3.794267416000366, + "learning_rate": 4.999077259571442e-06, + "loss": 1.0176, + "step": 17 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 4.788509845733643, + "learning_rate": 4.998958320824031e-06, + "loss": 1.0259, + "step": 18 + }, + { + "epoch": 0.10270270270270271, + "grad_norm": 10.027527809143066, + "learning_rate": 4.998832175729179e-06, + "loss": 1.3356, + "step": 19 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 4.612483978271484, + "learning_rate": 4.998698824650656e-06, + "loss": 1.4486, + "step": 20 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 3.8676936626434326, + "learning_rate": 4.998558267973014e-06, + "loss": 0.8372, + "step": 21 + }, + { + "epoch": 0.11891891891891893, + "grad_norm": 2.9611001014709473, + "learning_rate": 4.998410506101579e-06, + "loss": 0.7931, + "step": 22 + }, + { + "epoch": 0.12432432432432433, + "grad_norm": 5.508745193481445, + "learning_rate": 4.9982555394624595e-06, + "loss": 1.3022, + "step": 23 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 3.434845209121704, + "learning_rate": 4.998093368502539e-06, + "loss": 0.9739, + "step": 24 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 4.736802101135254, + "learning_rate": 4.9979239936894765e-06, + "loss": 1.1154, + "step": 25 + }, + { + "epoch": 0.14054054054054055, + "grad_norm": 3.69411039352417, + "learning_rate": 4.997747415511705e-06, + "loss": 0.7543, + "step": 26 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 2.8646645545959473, + "learning_rate": 4.997563634478428e-06, + "loss": 0.7278, + "step": 27 + }, + { + "epoch": 0.15135135135135136, + "grad_norm": 6.56904935836792, + "learning_rate": 4.997372651119626e-06, + "loss": 0.8167, + "step": 28 + }, + { + "epoch": 0.15675675675675677, + "grad_norm": 2.955914258956909, + "learning_rate": 4.997174465986044e-06, + "loss": 0.8031, + "step": 29 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 2.5714259147644043, + "learning_rate": 4.996969079649196e-06, + "loss": 0.689, + "step": 30 + }, + { + "epoch": 0.16756756756756758, + "grad_norm": 3.5165364742279053, + "learning_rate": 4.996756492701362e-06, + "loss": 0.8059, + "step": 31 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 3.2861921787261963, + "learning_rate": 4.996536705755591e-06, + "loss": 0.9658, + "step": 32 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 2.962470531463623, + "learning_rate": 4.996309719445687e-06, + "loss": 0.8349, + "step": 33 + }, + { + "epoch": 0.1837837837837838, + "grad_norm": 2.7694804668426514, + "learning_rate": 4.996075534426223e-06, + "loss": 0.8287, + "step": 34 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 3.405071258544922, + "learning_rate": 4.995834151372526e-06, + "loss": 1.1211, + "step": 35 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 2.8680710792541504, + "learning_rate": 4.995585570980685e-06, + "loss": 1.0841, + "step": 36 + }, + { + "epoch": 0.2, + "grad_norm": 3.341021776199341, + "learning_rate": 4.995329793967537e-06, + "loss": 0.6182, + "step": 37 + }, + { + "epoch": 0.20540540540540542, + "grad_norm": 3.0639379024505615, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.7647, + "step": 38 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 3.225759983062744, + "learning_rate": 4.994796653048457e-06, + "loss": 0.8691, + "step": 39 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 4.56926155090332, + "learning_rate": 4.994519290679965e-06, + "loss": 1.0404, + "step": 40 + }, + { + "epoch": 0.22162162162162163, + "grad_norm": 4.871571063995361, + "learning_rate": 4.994234734765043e-06, + "loss": 1.1877, + "step": 41 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 3.672215700149536, + "learning_rate": 4.993942986124278e-06, + "loss": 0.959, + "step": 42 + }, + { + "epoch": 0.23243243243243245, + "grad_norm": 3.184683322906494, + "learning_rate": 4.9936440455989975e-06, + "loss": 0.9249, + "step": 43 + }, + { + "epoch": 0.23783783783783785, + "grad_norm": 2.7092034816741943, + "learning_rate": 4.993337914051266e-06, + "loss": 0.6899, + "step": 44 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 3.153764486312866, + "learning_rate": 4.99302459236389e-06, + "loss": 0.9075, + "step": 45 + }, + { + "epoch": 0.24864864864864866, + "grad_norm": 3.3629748821258545, + "learning_rate": 4.992704081440407e-06, + "loss": 0.785, + "step": 46 + }, + { + "epoch": 0.25405405405405407, + "grad_norm": 4.478365898132324, + "learning_rate": 4.992376382205088e-06, + "loss": 1.008, + "step": 47 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 3.4001641273498535, + "learning_rate": 4.992041495602932e-06, + "loss": 0.7751, + "step": 48 + }, + { + "epoch": 0.2648648648648649, + "grad_norm": 2.522662878036499, + "learning_rate": 4.991699422599664e-06, + "loss": 0.9022, + "step": 49 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.764458179473877, + "learning_rate": 4.991350164181735e-06, + "loss": 0.8801, + "step": 50 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 2.814859628677368, + "learning_rate": 4.990993721356317e-06, + "loss": 0.7045, + "step": 51 + }, + { + "epoch": 0.2810810810810811, + "grad_norm": 2.441311836242676, + "learning_rate": 4.990630095151296e-06, + "loss": 0.7312, + "step": 52 + }, + { + "epoch": 0.2864864864864865, + "grad_norm": 2.4443013668060303, + "learning_rate": 4.9902592866152765e-06, + "loss": 0.9609, + "step": 53 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 2.2934701442718506, + "learning_rate": 4.989881296817575e-06, + "loss": 0.5753, + "step": 54 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 2.6286847591400146, + "learning_rate": 4.989496126848215e-06, + "loss": 0.5118, + "step": 55 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 3.6817069053649902, + "learning_rate": 4.989103777817928e-06, + "loss": 1.1261, + "step": 56 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 3.011197566986084, + "learning_rate": 4.988704250858145e-06, + "loss": 0.7823, + "step": 57 + }, + { + "epoch": 0.31351351351351353, + "grad_norm": 2.5490806102752686, + "learning_rate": 4.988297547121e-06, + "loss": 0.6019, + "step": 58 + }, + { + "epoch": 0.31891891891891894, + "grad_norm": 3.0803146362304688, + "learning_rate": 4.98788366777932e-06, + "loss": 0.825, + "step": 59 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 3.015730619430542, + "learning_rate": 4.987462614026625e-06, + "loss": 0.7667, + "step": 60 + }, + { + "epoch": 0.32972972972972975, + "grad_norm": 2.5371594429016113, + "learning_rate": 4.987034387077126e-06, + "loss": 0.8051, + "step": 61 + }, + { + "epoch": 0.33513513513513515, + "grad_norm": 2.6414010524749756, + "learning_rate": 4.986598988165718e-06, + "loss": 0.6895, + "step": 62 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 3.065131187438965, + "learning_rate": 4.9861564185479785e-06, + "loss": 0.9268, + "step": 63 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 2.5708694458007812, + "learning_rate": 4.985706679500163e-06, + "loss": 0.9854, + "step": 64 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 2.768915891647339, + "learning_rate": 4.9852497723192025e-06, + "loss": 0.8083, + "step": 65 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 2.567901849746704, + "learning_rate": 4.9847856983227e-06, + "loss": 0.9098, + "step": 66 + }, + { + "epoch": 0.3621621621621622, + "grad_norm": 2.5766549110412598, + "learning_rate": 4.984314458848923e-06, + "loss": 0.8881, + "step": 67 + }, + { + "epoch": 0.3675675675675676, + "grad_norm": 2.9778389930725098, + "learning_rate": 4.983836055256804e-06, + "loss": 0.9877, + "step": 68 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 2.7225165367126465, + "learning_rate": 4.983350488925935e-06, + "loss": 0.8282, + "step": 69 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 2.702287197113037, + "learning_rate": 4.982857761256564e-06, + "loss": 1.1756, + "step": 70 + }, + { + "epoch": 0.3837837837837838, + "grad_norm": 2.9815568923950195, + "learning_rate": 4.982357873669589e-06, + "loss": 0.8114, + "step": 71 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 3.27150297164917, + "learning_rate": 4.981850827606556e-06, + "loss": 0.6763, + "step": 72 + }, + { + "epoch": 0.3945945945945946, + "grad_norm": 2.568423271179199, + "learning_rate": 4.981336624529655e-06, + "loss": 0.9372, + "step": 73 + }, + { + "epoch": 0.4, + "grad_norm": 2.621175527572632, + "learning_rate": 4.980815265921714e-06, + "loss": 1.0155, + "step": 74 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 2.62827205657959, + "learning_rate": 4.980286753286196e-06, + "loss": 0.949, + "step": 75 + }, + { + "epoch": 0.41081081081081083, + "grad_norm": 2.9462146759033203, + "learning_rate": 4.979751088147192e-06, + "loss": 1.0134, + "step": 76 + }, + { + "epoch": 0.41621621621621624, + "grad_norm": 2.814852714538574, + "learning_rate": 4.979208272049425e-06, + "loss": 0.9722, + "step": 77 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 4.177679538726807, + "learning_rate": 4.978658306558235e-06, + "loss": 1.2259, + "step": 78 + }, + { + "epoch": 0.42702702702702705, + "grad_norm": 2.813084125518799, + "learning_rate": 4.978101193259578e-06, + "loss": 0.834, + "step": 79 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.71824049949646, + "learning_rate": 4.977536933760025e-06, + "loss": 0.6151, + "step": 80 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 4.992153167724609, + "learning_rate": 4.976965529686755e-06, + "loss": 1.0475, + "step": 81 + }, + { + "epoch": 0.44324324324324327, + "grad_norm": 2.4810822010040283, + "learning_rate": 4.976386982687548e-06, + "loss": 0.8324, + "step": 82 + }, + { + "epoch": 0.4486486486486487, + "grad_norm": 4.509149074554443, + "learning_rate": 4.9758012944307845e-06, + "loss": 0.997, + "step": 83 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 3.114325761795044, + "learning_rate": 4.975208466605436e-06, + "loss": 1.2024, + "step": 84 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 3.297091007232666, + "learning_rate": 4.974608500921064e-06, + "loss": 0.9146, + "step": 85 + }, + { + "epoch": 0.4648648648648649, + "grad_norm": 2.824475049972534, + "learning_rate": 4.974001399107816e-06, + "loss": 0.7181, + "step": 86 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 20.262290954589844, + "learning_rate": 4.973387162916415e-06, + "loss": 0.8599, + "step": 87 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 4.015744686126709, + "learning_rate": 4.972765794118158e-06, + "loss": 0.6081, + "step": 88 + }, + { + "epoch": 0.4810810810810811, + "grad_norm": 2.8033058643341064, + "learning_rate": 4.9721372945049114e-06, + "loss": 0.8764, + "step": 89 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 5.271846294403076, + "learning_rate": 4.971501665889107e-06, + "loss": 0.8622, + "step": 90 + }, + { + "epoch": 0.4918918918918919, + "grad_norm": 2.557264804840088, + "learning_rate": 4.9708589101037306e-06, + "loss": 0.5523, + "step": 91 + }, + { + "epoch": 0.4972972972972973, + "grad_norm": 4.342173099517822, + "learning_rate": 4.970209029002325e-06, + "loss": 0.8922, + "step": 92 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 2.950364351272583, + "learning_rate": 4.969552024458977e-06, + "loss": 0.9455, + "step": 93 + }, + { + "epoch": 0.5081081081081081, + "grad_norm": 2.6453042030334473, + "learning_rate": 4.968887898368318e-06, + "loss": 0.8342, + "step": 94 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 3.486766815185547, + "learning_rate": 4.968216652645515e-06, + "loss": 0.8476, + "step": 95 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 2.884152889251709, + "learning_rate": 4.967538289226268e-06, + "loss": 0.8879, + "step": 96 + }, + { + "epoch": 0.5243243243243243, + "grad_norm": 2.4130594730377197, + "learning_rate": 4.966852810066798e-06, + "loss": 0.7114, + "step": 97 + }, + { + "epoch": 0.5297297297297298, + "grad_norm": 3.182410955429077, + "learning_rate": 4.9661602171438524e-06, + "loss": 0.6757, + "step": 98 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 2.5027542114257812, + "learning_rate": 4.965460512454687e-06, + "loss": 0.8029, + "step": 99 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.3096024990081787, + "learning_rate": 4.964753698017071e-06, + "loss": 0.842, + "step": 100 + }, + { + "epoch": 0.5459459459459459, + "grad_norm": 2.875657081604004, + "learning_rate": 4.964039775869271e-06, + "loss": 0.6339, + "step": 101 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 2.505406141281128, + "learning_rate": 4.963318748070056e-06, + "loss": 0.7743, + "step": 102 + }, + { + "epoch": 0.5567567567567567, + "grad_norm": 3.552562713623047, + "learning_rate": 4.9625906166986815e-06, + "loss": 0.926, + "step": 103 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 2.717942476272583, + "learning_rate": 4.961855383854889e-06, + "loss": 0.7037, + "step": 104 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 2.5049386024475098, + "learning_rate": 4.961113051658901e-06, + "loss": 0.561, + "step": 105 + }, + { + "epoch": 0.572972972972973, + "grad_norm": 2.3112900257110596, + "learning_rate": 4.96036362225141e-06, + "loss": 0.7316, + "step": 106 + }, + { + "epoch": 0.5783783783783784, + "grad_norm": 2.470257520675659, + "learning_rate": 4.959607097793575e-06, + "loss": 0.6426, + "step": 107 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 3.8040788173675537, + "learning_rate": 4.9588434804670176e-06, + "loss": 1.0044, + "step": 108 + }, + { + "epoch": 0.5891891891891892, + "grad_norm": 3.143547296524048, + "learning_rate": 4.958072772473812e-06, + "loss": 0.9219, + "step": 109 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 3.5052590370178223, + "learning_rate": 4.9572949760364795e-06, + "loss": 0.6056, + "step": 110 + }, + { + "epoch": 0.6, + "grad_norm": 3.064009428024292, + "learning_rate": 4.9565100933979835e-06, + "loss": 0.6346, + "step": 111 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 2.694610595703125, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.9856, + "step": 112 + }, + { + "epoch": 0.6108108108108108, + "grad_norm": 2.5885775089263916, + "learning_rate": 4.954919078591521e-06, + "loss": 0.8669, + "step": 113 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 2.593609571456909, + "learning_rate": 4.954112951011628e-06, + "loss": 0.7201, + "step": 114 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 3.3045759201049805, + "learning_rate": 4.9532997464067065e-06, + "loss": 0.9095, + "step": 115 + }, + { + "epoch": 0.6270270270270271, + "grad_norm": 2.8144869804382324, + "learning_rate": 4.952479467121828e-06, + "loss": 1.0213, + "step": 116 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 2.5460312366485596, + "learning_rate": 4.951652115522463e-06, + "loss": 1.1154, + "step": 117 + }, + { + "epoch": 0.6378378378378379, + "grad_norm": 2.795137405395508, + "learning_rate": 4.950817693994481e-06, + "loss": 0.691, + "step": 118 + }, + { + "epoch": 0.6432432432432432, + "grad_norm": 2.4979195594787598, + "learning_rate": 4.949976204944135e-06, + "loss": 0.7224, + "step": 119 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 3.3131983280181885, + "learning_rate": 4.949127650798063e-06, + "loss": 0.9256, + "step": 120 + }, + { + "epoch": 0.654054054054054, + "grad_norm": 2.9060285091400146, + "learning_rate": 4.948272034003275e-06, + "loss": 0.6892, + "step": 121 + }, + { + "epoch": 0.6594594594594595, + "grad_norm": 3.695594549179077, + "learning_rate": 4.947409357027148e-06, + "loss": 0.5878, + "step": 122 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 3.1250460147857666, + "learning_rate": 4.9465396223574165e-06, + "loss": 0.9904, + "step": 123 + }, + { + "epoch": 0.6702702702702703, + "grad_norm": 4.024891376495361, + "learning_rate": 4.945662832502172e-06, + "loss": 1.1592, + "step": 124 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 2.6886494159698486, + "learning_rate": 4.944778989989847e-06, + "loss": 1.0041, + "step": 125 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 2.366912841796875, + "learning_rate": 4.943888097369216e-06, + "loss": 0.7045, + "step": 126 + }, + { + "epoch": 0.6864864864864865, + "grad_norm": 2.394932270050049, + "learning_rate": 4.942990157209381e-06, + "loss": 0.6685, + "step": 127 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 2.61933970451355, + "learning_rate": 4.9420851720997674e-06, + "loss": 0.8812, + "step": 128 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 2.7395646572113037, + "learning_rate": 4.94117314465012e-06, + "loss": 1.3014, + "step": 129 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 3.065484046936035, + "learning_rate": 4.940254077490487e-06, + "loss": 0.6978, + "step": 130 + }, + { + "epoch": 0.7081081081081081, + "grad_norm": 2.895038366317749, + "learning_rate": 4.939327973271222e-06, + "loss": 0.6249, + "step": 131 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 3.1773312091827393, + "learning_rate": 4.9383948346629665e-06, + "loss": 0.6423, + "step": 132 + }, + { + "epoch": 0.7189189189189189, + "grad_norm": 2.2378008365631104, + "learning_rate": 4.937454664356652e-06, + "loss": 0.7193, + "step": 133 + }, + { + "epoch": 0.7243243243243244, + "grad_norm": 2.5673701763153076, + "learning_rate": 4.9365074650634855e-06, + "loss": 0.7065, + "step": 134 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 2.7348387241363525, + "learning_rate": 4.9355532395149445e-06, + "loss": 1.0046, + "step": 135 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 2.391741991043091, + "learning_rate": 4.9345919904627655e-06, + "loss": 0.6771, + "step": 136 + }, + { + "epoch": 0.7405405405405405, + "grad_norm": 2.2096705436706543, + "learning_rate": 4.933623720678944e-06, + "loss": 0.6589, + "step": 137 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 3.0840072631835938, + "learning_rate": 4.932648432955718e-06, + "loss": 0.8755, + "step": 138 + }, + { + "epoch": 0.7513513513513513, + "grad_norm": 2.4970428943634033, + "learning_rate": 4.931666130105564e-06, + "loss": 0.6685, + "step": 139 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 4.315455436706543, + "learning_rate": 4.930676814961189e-06, + "loss": 0.8101, + "step": 140 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 5.388065814971924, + "learning_rate": 4.92968049037552e-06, + "loss": 0.8193, + "step": 141 + }, + { + "epoch": 0.7675675675675676, + "grad_norm": 2.6107139587402344, + "learning_rate": 4.9286771592217005e-06, + "loss": 0.7852, + "step": 142 + }, + { + "epoch": 0.772972972972973, + "grad_norm": 3.936556577682495, + "learning_rate": 4.927666824393076e-06, + "loss": 1.0388, + "step": 143 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 2.74424409866333, + "learning_rate": 4.926649488803191e-06, + "loss": 0.8266, + "step": 144 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 2.8998451232910156, + "learning_rate": 4.925625155385776e-06, + "loss": 0.4895, + "step": 145 + }, + { + "epoch": 0.7891891891891892, + "grad_norm": 3.0631520748138428, + "learning_rate": 4.924593827094743e-06, + "loss": 0.8759, + "step": 146 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 3.233267307281494, + "learning_rate": 4.923555506904176e-06, + "loss": 0.701, + "step": 147 + }, + { + "epoch": 0.8, + "grad_norm": 2.87701416015625, + "learning_rate": 4.922510197808321e-06, + "loss": 1.1327, + "step": 148 + }, + { + "epoch": 0.8054054054054054, + "grad_norm": 3.650576114654541, + "learning_rate": 4.921457902821578e-06, + "loss": 0.7587, + "step": 149 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 3.232112407684326, + "learning_rate": 4.920398624978493e-06, + "loss": 1.2158, + "step": 150 + }, + { + "epoch": 0.8162162162162162, + "grad_norm": 2.468384027481079, + "learning_rate": 4.919332367333748e-06, + "loss": 0.6852, + "step": 151 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 2.5947415828704834, + "learning_rate": 4.918259132962154e-06, + "loss": 0.6611, + "step": 152 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 3.0171427726745605, + "learning_rate": 4.917178924958638e-06, + "loss": 0.7327, + "step": 153 + }, + { + "epoch": 0.8324324324324325, + "grad_norm": 3.293184518814087, + "learning_rate": 4.916091746438243e-06, + "loss": 0.8528, + "step": 154 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 4.0570969581604, + "learning_rate": 4.9149976005361085e-06, + "loss": 0.9141, + "step": 155 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 2.8782784938812256, + "learning_rate": 4.913896490407467e-06, + "loss": 1.1132, + "step": 156 + }, + { + "epoch": 0.8486486486486486, + "grad_norm": 2.5671517848968506, + "learning_rate": 4.912788419227635e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.8540540540540541, + "grad_norm": 2.9445390701293945, + "learning_rate": 4.911673390192002e-06, + "loss": 0.9227, + "step": 158 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 2.472595453262329, + "learning_rate": 4.910551406516023e-06, + "loss": 0.8154, + "step": 159 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 2.5233397483825684, + "learning_rate": 4.909422471435207e-06, + "loss": 0.9897, + "step": 160 + }, + { + "epoch": 0.8702702702702703, + "grad_norm": 3.3919546604156494, + "learning_rate": 4.90828658820511e-06, + "loss": 0.6162, + "step": 161 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 3.060908555984497, + "learning_rate": 4.907143760101325e-06, + "loss": 0.5734, + "step": 162 + }, + { + "epoch": 0.8810810810810811, + "grad_norm": 3.4584782123565674, + "learning_rate": 4.905993990419472e-06, + "loss": 0.8328, + "step": 163 + }, + { + "epoch": 0.8864864864864865, + "grad_norm": 2.936570644378662, + "learning_rate": 4.904837282475187e-06, + "loss": 0.6787, + "step": 164 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 2.564837694168091, + "learning_rate": 4.9036736396041165e-06, + "loss": 0.9658, + "step": 165 + }, + { + "epoch": 0.8972972972972973, + "grad_norm": 3.2509360313415527, + "learning_rate": 4.902503065161905e-06, + "loss": 0.7899, + "step": 166 + }, + { + "epoch": 0.9027027027027027, + "grad_norm": 2.9730329513549805, + "learning_rate": 4.901325562524185e-06, + "loss": 0.9476, + "step": 167 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 3.044980049133301, + "learning_rate": 4.900141135086569e-06, + "loss": 0.7589, + "step": 168 + }, + { + "epoch": 0.9135135135135135, + "grad_norm": 3.030585527420044, + "learning_rate": 4.898949786264638e-06, + "loss": 0.6724, + "step": 169 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 2.249122142791748, + "learning_rate": 4.897751519493933e-06, + "loss": 0.6968, + "step": 170 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 2.9816982746124268, + "learning_rate": 4.896546338229945e-06, + "loss": 0.7984, + "step": 171 + }, + { + "epoch": 0.9297297297297298, + "grad_norm": 2.415736675262451, + "learning_rate": 4.8953342459481034e-06, + "loss": 0.6109, + "step": 172 + }, + { + "epoch": 0.9351351351351351, + "grad_norm": 2.740518808364868, + "learning_rate": 4.894115246143768e-06, + "loss": 0.8126, + "step": 173 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 2.7610201835632324, + "learning_rate": 4.892889342332218e-06, + "loss": 0.6862, + "step": 174 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 3.057025194168091, + "learning_rate": 4.891656538048642e-06, + "loss": 0.9895, + "step": 175 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 2.569751262664795, + "learning_rate": 4.890416836848128e-06, + "loss": 0.8481, + "step": 176 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 2.4443397521972656, + "learning_rate": 4.889170242305652e-06, + "loss": 0.6478, + "step": 177 + }, + { + "epoch": 0.9621621621621622, + "grad_norm": 2.5009846687316895, + "learning_rate": 4.887916758016069e-06, + "loss": 0.9714, + "step": 178 + }, + { + "epoch": 0.9675675675675676, + "grad_norm": 3.101975202560425, + "learning_rate": 4.886656387594104e-06, + "loss": 1.1264, + "step": 179 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 2.6144704818725586, + "learning_rate": 4.885389134674338e-06, + "loss": 0.7664, + "step": 180 + }, + { + "epoch": 0.9783783783783784, + "grad_norm": 2.5834381580352783, + "learning_rate": 4.884115002911197e-06, + "loss": 0.6131, + "step": 181 + }, + { + "epoch": 0.9837837837837838, + "grad_norm": 2.5378055572509766, + "learning_rate": 4.88283399597895e-06, + "loss": 0.8733, + "step": 182 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 2.4095377922058105, + "learning_rate": 4.881546117571686e-06, + "loss": 0.643, + "step": 183 + }, + { + "epoch": 0.9945945945945946, + "grad_norm": 2.9554507732391357, + "learning_rate": 4.8802513714033135e-06, + "loss": 0.7287, + "step": 184 + }, + { + "epoch": 1.0, + "grad_norm": 2.8279213905334473, + "learning_rate": 4.878949761207545e-06, + "loss": 0.9927, + "step": 185 + }, + { + "epoch": 1.0054054054054054, + "grad_norm": 2.9361412525177, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.66, + "step": 186 + }, + { + "epoch": 1.0108108108108107, + "grad_norm": 3.392244338989258, + "learning_rate": 4.876325963767623e-06, + "loss": 0.594, + "step": 187 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 2.6276044845581055, + "learning_rate": 4.875003784089822e-06, + "loss": 0.5825, + "step": 188 + }, + { + "epoch": 1.0216216216216216, + "grad_norm": 2.2875545024871826, + "learning_rate": 4.873674755517305e-06, + "loss": 0.6594, + "step": 189 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 2.8086795806884766, + "learning_rate": 4.872338881882645e-06, + "loss": 0.7536, + "step": 190 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 2.3685200214385986, + "learning_rate": 4.870996167038154e-06, + "loss": 0.4849, + "step": 191 + }, + { + "epoch": 1.037837837837838, + "grad_norm": 3.0264766216278076, + "learning_rate": 4.869646614855877e-06, + "loss": 0.3771, + "step": 192 + }, + { + "epoch": 1.0432432432432432, + "grad_norm": 4.335122108459473, + "learning_rate": 4.868290229227567e-06, + "loss": 0.8545, + "step": 193 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 3.442172050476074, + "learning_rate": 4.866927014064692e-06, + "loss": 0.3698, + "step": 194 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 3.326539993286133, + "learning_rate": 4.86555697329841e-06, + "loss": 0.8468, + "step": 195 + }, + { + "epoch": 1.0594594594594595, + "grad_norm": 3.0372447967529297, + "learning_rate": 4.864180110879562e-06, + "loss": 0.8232, + "step": 196 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 2.955343008041382, + "learning_rate": 4.862796430778663e-06, + "loss": 0.4097, + "step": 197 + }, + { + "epoch": 1.0702702702702702, + "grad_norm": 2.4095399379730225, + "learning_rate": 4.861405936985889e-06, + "loss": 0.6746, + "step": 198 + }, + { + "epoch": 1.0756756756756758, + "grad_norm": 2.763500452041626, + "learning_rate": 4.860008633511059e-06, + "loss": 0.6605, + "step": 199 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 2.6751155853271484, + "learning_rate": 4.8586045243836384e-06, + "loss": 0.471, + "step": 200 + }, + { + "epoch": 1.0864864864864865, + "grad_norm": 3.3507862091064453, + "learning_rate": 4.857193613652711e-06, + "loss": 0.7665, + "step": 201 + }, + { + "epoch": 1.0918918918918918, + "grad_norm": 3.3064827919006348, + "learning_rate": 4.8557759053869775e-06, + "loss": 0.6436, + "step": 202 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 2.571828603744507, + "learning_rate": 4.854351403674741e-06, + "loss": 0.4642, + "step": 203 + }, + { + "epoch": 1.1027027027027028, + "grad_norm": 2.883220911026001, + "learning_rate": 4.852920112623895e-06, + "loss": 0.5737, + "step": 204 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 3.026144027709961, + "learning_rate": 4.851482036361912e-06, + "loss": 0.7302, + "step": 205 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 2.6689612865448, + "learning_rate": 4.850037179035829e-06, + "loss": 0.5229, + "step": 206 + }, + { + "epoch": 1.118918918918919, + "grad_norm": 2.4019956588745117, + "learning_rate": 4.8485855448122425e-06, + "loss": 0.5529, + "step": 207 + }, + { + "epoch": 1.1243243243243244, + "grad_norm": 2.3546230792999268, + "learning_rate": 4.847127137877286e-06, + "loss": 0.3635, + "step": 208 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 2.999096393585205, + "learning_rate": 4.8456619624366285e-06, + "loss": 0.8149, + "step": 209 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 10.072900772094727, + "learning_rate": 4.844190022715456e-06, + "loss": 0.8333, + "step": 210 + }, + { + "epoch": 1.1405405405405404, + "grad_norm": 2.222123384475708, + "learning_rate": 4.84271132295846e-06, + "loss": 0.3717, + "step": 211 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 2.8751113414764404, + "learning_rate": 4.841225867429826e-06, + "loss": 0.5994, + "step": 212 + }, + { + "epoch": 1.1513513513513514, + "grad_norm": 2.9580111503601074, + "learning_rate": 4.839733660413224e-06, + "loss": 0.8382, + "step": 213 + }, + { + "epoch": 1.1567567567567567, + "grad_norm": 4.628892421722412, + "learning_rate": 4.838234706211792e-06, + "loss": 0.818, + "step": 214 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 2.5103509426116943, + "learning_rate": 4.836729009148124e-06, + "loss": 0.4267, + "step": 215 + }, + { + "epoch": 1.1675675675675676, + "grad_norm": 2.6093738079071045, + "learning_rate": 4.835216573564261e-06, + "loss": 0.3472, + "step": 216 + }, + { + "epoch": 1.172972972972973, + "grad_norm": 3.0792338848114014, + "learning_rate": 4.833697403821672e-06, + "loss": 0.6323, + "step": 217 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 2.845163345336914, + "learning_rate": 4.8321715043012516e-06, + "loss": 0.6831, + "step": 218 + }, + { + "epoch": 1.1837837837837837, + "grad_norm": 3.0433948040008545, + "learning_rate": 4.830638879403296e-06, + "loss": 0.3682, + "step": 219 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 2.6533594131469727, + "learning_rate": 4.8290995335475e-06, + "loss": 0.4154, + "step": 220 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 2.9271352291107178, + "learning_rate": 4.827553471172935e-06, + "loss": 0.3991, + "step": 221 + }, + { + "epoch": 1.2, + "grad_norm": 2.9243528842926025, + "learning_rate": 4.826000696738045e-06, + "loss": 0.4538, + "step": 222 + }, + { + "epoch": 1.2054054054054055, + "grad_norm": 2.537332534790039, + "learning_rate": 4.824441214720629e-06, + "loss": 0.7692, + "step": 223 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 3.9193246364593506, + "learning_rate": 4.8228750296178275e-06, + "loss": 0.6038, + "step": 224 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 2.6646728515625, + "learning_rate": 4.821302145946113e-06, + "loss": 0.4147, + "step": 225 + }, + { + "epoch": 1.2216216216216216, + "grad_norm": 2.6519482135772705, + "learning_rate": 4.819722568241274e-06, + "loss": 0.5398, + "step": 226 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 2.2018048763275146, + "learning_rate": 4.818136301058401e-06, + "loss": 0.3864, + "step": 227 + }, + { + "epoch": 1.2324324324324325, + "grad_norm": 2.5660712718963623, + "learning_rate": 4.816543348971879e-06, + "loss": 0.5712, + "step": 228 + }, + { + "epoch": 1.2378378378378379, + "grad_norm": 3.237663745880127, + "learning_rate": 4.814943716575368e-06, + "loss": 0.662, + "step": 229 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 2.5570430755615234, + "learning_rate": 4.813337408481793e-06, + "loss": 0.8661, + "step": 230 + }, + { + "epoch": 1.2486486486486488, + "grad_norm": 2.9231269359588623, + "learning_rate": 4.811724429323329e-06, + "loss": 0.9218, + "step": 231 + }, + { + "epoch": 1.2540540540540541, + "grad_norm": 3.637084722518921, + "learning_rate": 4.810104783751389e-06, + "loss": 0.5597, + "step": 232 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 3.0218842029571533, + "learning_rate": 4.8084784764366125e-06, + "loss": 0.4786, + "step": 233 + }, + { + "epoch": 1.2648648648648648, + "grad_norm": 2.770214080810547, + "learning_rate": 4.806845512068846e-06, + "loss": 0.5219, + "step": 234 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 3.093053102493286, + "learning_rate": 4.805205895357137e-06, + "loss": 0.643, + "step": 235 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 2.6373348236083984, + "learning_rate": 4.803559631029713e-06, + "loss": 0.5858, + "step": 236 + }, + { + "epoch": 1.281081081081081, + "grad_norm": 2.452030897140503, + "learning_rate": 4.801906723833973e-06, + "loss": 0.4185, + "step": 237 + }, + { + "epoch": 1.2864864864864864, + "grad_norm": 2.72564697265625, + "learning_rate": 4.8002471785364734e-06, + "loss": 0.4917, + "step": 238 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 3.0389158725738525, + "learning_rate": 4.798580999922913e-06, + "loss": 0.645, + "step": 239 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 3.7002289295196533, + "learning_rate": 4.796908192798117e-06, + "loss": 0.5378, + "step": 240 + }, + { + "epoch": 1.3027027027027027, + "grad_norm": 2.1876111030578613, + "learning_rate": 4.7952287619860276e-06, + "loss": 0.5197, + "step": 241 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 3.903337240219116, + "learning_rate": 4.793542712329689e-06, + "loss": 1.0226, + "step": 242 + }, + { + "epoch": 1.3135135135135134, + "grad_norm": 2.3623552322387695, + "learning_rate": 4.791850048691228e-06, + "loss": 0.5502, + "step": 243 + }, + { + "epoch": 1.318918918918919, + "grad_norm": 3.0669031143188477, + "learning_rate": 4.79015077595185e-06, + "loss": 0.6976, + "step": 244 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 3.1480472087860107, + "learning_rate": 4.788444899011816e-06, + "loss": 0.4795, + "step": 245 + }, + { + "epoch": 1.3297297297297297, + "grad_norm": 3.7051920890808105, + "learning_rate": 4.786732422790432e-06, + "loss": 0.6526, + "step": 246 + }, + { + "epoch": 1.3351351351351353, + "grad_norm": 3.4358389377593994, + "learning_rate": 4.785013352226036e-06, + "loss": 0.5551, + "step": 247 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 2.3789355754852295, + "learning_rate": 4.7832876922759805e-06, + "loss": 0.3151, + "step": 248 + }, + { + "epoch": 1.345945945945946, + "grad_norm": 2.4843716621398926, + "learning_rate": 4.781555447916622e-06, + "loss": 0.6713, + "step": 249 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 3.0176303386688232, + "learning_rate": 4.779816624143302e-06, + "loss": 0.437, + "step": 250 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 2.868350028991699, + "learning_rate": 4.77807122597034e-06, + "loss": 0.7632, + "step": 251 + }, + { + "epoch": 1.3621621621621622, + "grad_norm": 2.4629738330841064, + "learning_rate": 4.776319258431009e-06, + "loss": 0.4894, + "step": 252 + }, + { + "epoch": 1.3675675675675676, + "grad_norm": 2.798297882080078, + "learning_rate": 4.77456072657753e-06, + "loss": 0.4456, + "step": 253 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 3.2977547645568848, + "learning_rate": 4.772795635481053e-06, + "loss": 0.5381, + "step": 254 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 4.1061906814575195, + "learning_rate": 4.77102399023164e-06, + "loss": 1.0302, + "step": 255 + }, + { + "epoch": 1.3837837837837839, + "grad_norm": 3.943284511566162, + "learning_rate": 4.769245795938261e-06, + "loss": 0.4875, + "step": 256 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 2.6420533657073975, + "learning_rate": 4.767461057728763e-06, + "loss": 0.4923, + "step": 257 + }, + { + "epoch": 1.3945945945945946, + "grad_norm": 3.3152263164520264, + "learning_rate": 4.76566978074987e-06, + "loss": 0.6699, + "step": 258 + }, + { + "epoch": 1.4, + "grad_norm": 2.6928882598876953, + "learning_rate": 4.7638719701671586e-06, + "loss": 0.6117, + "step": 259 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 2.706597328186035, + "learning_rate": 4.762067631165049e-06, + "loss": 0.8534, + "step": 260 + }, + { + "epoch": 1.4108108108108108, + "grad_norm": 2.9912848472595215, + "learning_rate": 4.760256768946787e-06, + "loss": 0.5057, + "step": 261 + }, + { + "epoch": 1.4162162162162162, + "grad_norm": 2.7098443508148193, + "learning_rate": 4.758439388734429e-06, + "loss": 0.7286, + "step": 262 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 3.1288092136383057, + "learning_rate": 4.7566154957688276e-06, + "loss": 0.9827, + "step": 263 + }, + { + "epoch": 1.427027027027027, + "grad_norm": 3.0505919456481934, + "learning_rate": 4.754785095309617e-06, + "loss": 0.7042, + "step": 264 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 2.6800339221954346, + "learning_rate": 4.752948192635199e-06, + "loss": 0.5179, + "step": 265 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 2.2246861457824707, + "learning_rate": 4.751104793042722e-06, + "loss": 0.8527, + "step": 266 + }, + { + "epoch": 1.4432432432432432, + "grad_norm": 2.4242751598358154, + "learning_rate": 4.7492549018480725e-06, + "loss": 0.5627, + "step": 267 + }, + { + "epoch": 1.4486486486486487, + "grad_norm": 2.763244152069092, + "learning_rate": 4.747398524385858e-06, + "loss": 0.8981, + "step": 268 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 2.856595993041992, + "learning_rate": 4.745535666009389e-06, + "loss": 0.5455, + "step": 269 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 2.4168624877929688, + "learning_rate": 4.743666332090664e-06, + "loss": 0.4348, + "step": 270 + }, + { + "epoch": 1.464864864864865, + "grad_norm": 2.5408060550689697, + "learning_rate": 4.74179052802036e-06, + "loss": 0.5524, + "step": 271 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 2.6216673851013184, + "learning_rate": 4.739908259207807e-06, + "loss": 0.7469, + "step": 272 + }, + { + "epoch": 1.4756756756756757, + "grad_norm": 5.397300720214844, + "learning_rate": 4.738019531080981e-06, + "loss": 0.7216, + "step": 273 + }, + { + "epoch": 1.481081081081081, + "grad_norm": 3.3481080532073975, + "learning_rate": 4.7361243490864825e-06, + "loss": 0.7527, + "step": 274 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 2.7943873405456543, + "learning_rate": 4.734222718689527e-06, + "loss": 0.7437, + "step": 275 + }, + { + "epoch": 1.491891891891892, + "grad_norm": 2.206890344619751, + "learning_rate": 4.732314645373922e-06, + "loss": 0.5187, + "step": 276 + }, + { + "epoch": 1.4972972972972973, + "grad_norm": 2.76442813873291, + "learning_rate": 4.730400134642055e-06, + "loss": 0.7186, + "step": 277 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 3.4754087924957275, + "learning_rate": 4.728479192014879e-06, + "loss": 0.9655, + "step": 278 + }, + { + "epoch": 1.5081081081081082, + "grad_norm": 2.923779249191284, + "learning_rate": 4.726551823031895e-06, + "loss": 0.6251, + "step": 279 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 3.1142773628234863, + "learning_rate": 4.7246180332511335e-06, + "loss": 0.4805, + "step": 280 + }, + { + "epoch": 1.518918918918919, + "grad_norm": 2.3477070331573486, + "learning_rate": 4.722677828249142e-06, + "loss": 1.0939, + "step": 281 + }, + { + "epoch": 1.5243243243243243, + "grad_norm": 2.8418569564819336, + "learning_rate": 4.720731213620972e-06, + "loss": 0.9485, + "step": 282 + }, + { + "epoch": 1.5297297297297296, + "grad_norm": 2.462710380554199, + "learning_rate": 4.718778194980152e-06, + "loss": 0.5805, + "step": 283 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 3.2379209995269775, + "learning_rate": 4.7168187779586805e-06, + "loss": 0.77, + "step": 284 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 3.0701661109924316, + "learning_rate": 4.71485296820701e-06, + "loss": 0.5932, + "step": 285 + }, + { + "epoch": 1.545945945945946, + "grad_norm": 4.099547386169434, + "learning_rate": 4.7128807713940245e-06, + "loss": 0.6296, + "step": 286 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 2.5529167652130127, + "learning_rate": 4.710902193207028e-06, + "loss": 0.6201, + "step": 287 + }, + { + "epoch": 1.5567567567567568, + "grad_norm": 2.794926881790161, + "learning_rate": 4.708917239351727e-06, + "loss": 0.5682, + "step": 288 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 3.2522501945495605, + "learning_rate": 4.706925915552214e-06, + "loss": 0.8877, + "step": 289 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 2.811847448348999, + "learning_rate": 4.704928227550949e-06, + "loss": 0.6521, + "step": 290 + }, + { + "epoch": 1.572972972972973, + "grad_norm": 2.7060673236846924, + "learning_rate": 4.702924181108745e-06, + "loss": 0.4929, + "step": 291 + }, + { + "epoch": 1.5783783783783782, + "grad_norm": 2.5009031295776367, + "learning_rate": 4.700913782004755e-06, + "loss": 0.4515, + "step": 292 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 2.6722700595855713, + "learning_rate": 4.698897036036446e-06, + "loss": 0.5477, + "step": 293 + }, + { + "epoch": 1.5891891891891892, + "grad_norm": 3.3333957195281982, + "learning_rate": 4.696873949019591e-06, + "loss": 0.9589, + "step": 294 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 2.4862897396087646, + "learning_rate": 4.694844526788248e-06, + "loss": 0.4425, + "step": 295 + }, + { + "epoch": 1.6, + "grad_norm": 2.78708553314209, + "learning_rate": 4.692808775194745e-06, + "loss": 0.4899, + "step": 296 + }, + { + "epoch": 1.6054054054054054, + "grad_norm": 2.9121289253234863, + "learning_rate": 4.690766700109659e-06, + "loss": 0.4884, + "step": 297 + }, + { + "epoch": 1.6108108108108108, + "grad_norm": 4.692054271697998, + "learning_rate": 4.688718307421807e-06, + "loss": 0.8977, + "step": 298 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 3.1290926933288574, + "learning_rate": 4.686663603038222e-06, + "loss": 0.6833, + "step": 299 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 3.5091123580932617, + "learning_rate": 4.6846025928841365e-06, + "loss": 0.9141, + "step": 300 + }, + { + "epoch": 1.627027027027027, + "grad_norm": 2.5466184616088867, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.5121, + "step": 301 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 2.7833092212677, + "learning_rate": 4.68046167905631e-06, + "loss": 0.5399, + "step": 302 + }, + { + "epoch": 1.637837837837838, + "grad_norm": 3.05135440826416, + "learning_rate": 4.678381787323889e-06, + "loss": 0.7921, + "step": 303 + }, + { + "epoch": 1.6432432432432433, + "grad_norm": 2.2391726970672607, + "learning_rate": 4.676295613703577e-06, + "loss": 0.7178, + "step": 304 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 2.3654022216796875, + "learning_rate": 4.674203164211357e-06, + "loss": 0.7162, + "step": 305 + }, + { + "epoch": 1.654054054054054, + "grad_norm": 2.436009645462036, + "learning_rate": 4.67210444488131e-06, + "loss": 0.6539, + "step": 306 + }, + { + "epoch": 1.6594594594594594, + "grad_norm": 2.6034209728240967, + "learning_rate": 4.669999461765599e-06, + "loss": 0.7214, + "step": 307 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 2.804229497909546, + "learning_rate": 4.6678882209344474e-06, + "loss": 0.7451, + "step": 308 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 2.6239655017852783, + "learning_rate": 4.665770728476127e-06, + "loss": 0.6464, + "step": 309 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 2.9320099353790283, + "learning_rate": 4.663646990496939e-06, + "loss": 0.6669, + "step": 310 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 3.09713077545166, + "learning_rate": 4.661517013121189e-06, + "loss": 0.8972, + "step": 311 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 3.6576132774353027, + "learning_rate": 4.659380802491181e-06, + "loss": 0.6286, + "step": 312 + }, + { + "epoch": 1.691891891891892, + "grad_norm": 2.9320433139801025, + "learning_rate": 4.6572383647671915e-06, + "loss": 0.3631, + "step": 313 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 3.399357557296753, + "learning_rate": 4.655089706127457e-06, + "loss": 0.5682, + "step": 314 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 2.7667412757873535, + "learning_rate": 4.652934832768148e-06, + "loss": 0.5457, + "step": 315 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 2.3023321628570557, + "learning_rate": 4.650773750903363e-06, + "loss": 0.6601, + "step": 316 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 2.6584670543670654, + "learning_rate": 4.6486064667651005e-06, + "loss": 0.5882, + "step": 317 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 5.528168678283691, + "learning_rate": 4.646432986603245e-06, + "loss": 0.7628, + "step": 318 + }, + { + "epoch": 1.7243243243243245, + "grad_norm": 3.054884195327759, + "learning_rate": 4.644253316685552e-06, + "loss": 0.6877, + "step": 319 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 3.2672388553619385, + "learning_rate": 4.6420674632976205e-06, + "loss": 0.7026, + "step": 320 + }, + { + "epoch": 1.7351351351351352, + "grad_norm": 3.109384536743164, + "learning_rate": 4.639875432742886e-06, + "loss": 0.5236, + "step": 321 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 3.3593883514404297, + "learning_rate": 4.6376772313425975e-06, + "loss": 0.6463, + "step": 322 + }, + { + "epoch": 1.7459459459459459, + "grad_norm": 2.6352698802948, + "learning_rate": 4.635472865435795e-06, + "loss": 0.6903, + "step": 323 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 2.751690149307251, + "learning_rate": 4.6332623413792995e-06, + "loss": 0.7342, + "step": 324 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 2.670915126800537, + "learning_rate": 4.6310456655476874e-06, + "loss": 0.4302, + "step": 325 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 2.7648138999938965, + "learning_rate": 4.6288228443332786e-06, + "loss": 0.5108, + "step": 326 + }, + { + "epoch": 1.7675675675675677, + "grad_norm": 2.7451536655426025, + "learning_rate": 4.626593884146111e-06, + "loss": 0.7646, + "step": 327 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 2.4656403064727783, + "learning_rate": 4.624358791413928e-06, + "loss": 0.5529, + "step": 328 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 2.5987517833709717, + "learning_rate": 4.622117572582159e-06, + "loss": 0.609, + "step": 329 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 3.3843371868133545, + "learning_rate": 4.619870234113894e-06, + "loss": 0.9146, + "step": 330 + }, + { + "epoch": 1.7891891891891891, + "grad_norm": 2.3542068004608154, + "learning_rate": 4.617616782489878e-06, + "loss": 0.6887, + "step": 331 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 2.2049715518951416, + "learning_rate": 4.615357224208477e-06, + "loss": 0.505, + "step": 332 + }, + { + "epoch": 1.8, + "grad_norm": 2.453920364379883, + "learning_rate": 4.613091565785674e-06, + "loss": 0.8384, + "step": 333 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 2.5751583576202393, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5512, + "step": 334 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 2.524075984954834, + "learning_rate": 4.608541974667714e-06, + "loss": 0.4877, + "step": 335 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 2.2856955528259277, + "learning_rate": 4.606258055092397e-06, + "loss": 0.5583, + "step": 336 + }, + { + "epoch": 1.8216216216216217, + "grad_norm": 2.2773683071136475, + "learning_rate": 4.603968061615321e-06, + "loss": 0.5421, + "step": 337 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 4.085512161254883, + "learning_rate": 4.601672000840231e-06, + "loss": 0.942, + "step": 338 + }, + { + "epoch": 1.8324324324324324, + "grad_norm": 2.3710968494415283, + "learning_rate": 4.5993698793883715e-06, + "loss": 0.3773, + "step": 339 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 2.745534658432007, + "learning_rate": 4.597061703898462e-06, + "loss": 0.9694, + "step": 340 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 2.463207244873047, + "learning_rate": 4.594747481026685e-06, + "loss": 0.4667, + "step": 341 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 2.7216601371765137, + "learning_rate": 4.592427217446656e-06, + "loss": 0.4267, + "step": 342 + }, + { + "epoch": 1.8540540540540542, + "grad_norm": 2.545664072036743, + "learning_rate": 4.590100919849413e-06, + "loss": 0.9245, + "step": 343 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 3.692840337753296, + "learning_rate": 4.587768594943396e-06, + "loss": 0.7502, + "step": 344 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 2.993229627609253, + "learning_rate": 4.585430249454426e-06, + "loss": 0.4689, + "step": 345 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 2.162867546081543, + "learning_rate": 4.583085890125682e-06, + "loss": 0.6188, + "step": 346 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 2.2169792652130127, + "learning_rate": 4.5807355237176896e-06, + "loss": 0.6352, + "step": 347 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 3.978985548019409, + "learning_rate": 4.578379157008296e-06, + "loss": 0.464, + "step": 348 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 2.236682653427124, + "learning_rate": 4.57601679679265e-06, + "loss": 0.5943, + "step": 349 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 2.528754472732544, + "learning_rate": 4.573648449883188e-06, + "loss": 0.6949, + "step": 350 + }, + { + "epoch": 1.8972972972972975, + "grad_norm": 2.7673721313476562, + "learning_rate": 4.571274123109606e-06, + "loss": 0.4333, + "step": 351 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 2.698012351989746, + "learning_rate": 4.568893823318847e-06, + "loss": 0.6796, + "step": 352 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 2.9640560150146484, + "learning_rate": 4.566507557375077e-06, + "loss": 0.6139, + "step": 353 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 2.417628526687622, + "learning_rate": 4.5641153321596684e-06, + "loss": 0.4515, + "step": 354 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 2.676739454269409, + "learning_rate": 4.56171715457118e-06, + "loss": 0.8426, + "step": 355 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 2.8428189754486084, + "learning_rate": 4.559313031525331e-06, + "loss": 0.5806, + "step": 356 + }, + { + "epoch": 1.9297297297297298, + "grad_norm": 2.6817944049835205, + "learning_rate": 4.55690296995499e-06, + "loss": 0.5927, + "step": 357 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 3.5939931869506836, + "learning_rate": 4.554486976810149e-06, + "loss": 0.9986, + "step": 358 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 2.86688494682312, + "learning_rate": 4.552065059057906e-06, + "loss": 0.6813, + "step": 359 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 2.9295246601104736, + "learning_rate": 4.549637223682441e-06, + "loss": 1.0832, + "step": 360 + }, + { + "epoch": 1.9513513513513514, + "grad_norm": 2.6939451694488525, + "learning_rate": 4.547203477685005e-06, + "loss": 0.7377, + "step": 361 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 2.226055145263672, + "learning_rate": 4.544763828083888e-06, + "loss": 0.5412, + "step": 362 + }, + { + "epoch": 1.962162162162162, + "grad_norm": 2.490187406539917, + "learning_rate": 4.542318281914405e-06, + "loss": 0.6955, + "step": 363 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 2.9241302013397217, + "learning_rate": 4.53986684622888e-06, + "loss": 0.6774, + "step": 364 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 2.988084554672241, + "learning_rate": 4.537409528096615e-06, + "loss": 0.5832, + "step": 365 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 2.9380626678466797, + "learning_rate": 4.534946334603879e-06, + "loss": 0.606, + "step": 366 + }, + { + "epoch": 1.983783783783784, + "grad_norm": 2.667588710784912, + "learning_rate": 4.532477272853882e-06, + "loss": 0.4991, + "step": 367 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 2.9711899757385254, + "learning_rate": 4.530002349966759e-06, + "loss": 0.4442, + "step": 368 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 3.443957805633545, + "learning_rate": 4.5275215730795445e-06, + "loss": 0.6566, + "step": 369 + }, + { + "epoch": 2.0, + "grad_norm": 3.590317487716675, + "learning_rate": 4.525034949346156e-06, + "loss": 0.5687, + "step": 370 + }, + { + "epoch": 2.0054054054054054, + "grad_norm": 3.678600549697876, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4458, + "step": 371 + }, + { + "epoch": 2.0108108108108107, + "grad_norm": 3.803563356399536, + "learning_rate": 4.5200441900408045e-06, + "loss": 0.4418, + "step": 372 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 2.9187233448028564, + "learning_rate": 4.517540068860898e-06, + "loss": 0.7057, + "step": 373 + }, + { + "epoch": 2.0216216216216214, + "grad_norm": 2.693603515625, + "learning_rate": 4.515030129618884e-06, + "loss": 0.4491, + "step": 374 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 2.3883047103881836, + "learning_rate": 4.512514379552779e-06, + "loss": 0.3571, + "step": 375 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 4.558557033538818, + "learning_rate": 4.509992825917352e-06, + "loss": 0.5056, + "step": 376 + }, + { + "epoch": 2.037837837837838, + "grad_norm": 3.9574761390686035, + "learning_rate": 4.507465475984109e-06, + "loss": 0.6834, + "step": 377 + }, + { + "epoch": 2.0432432432432432, + "grad_norm": 5.34630012512207, + "learning_rate": 4.504932337041272e-06, + "loss": 0.6726, + "step": 378 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 3.198740243911743, + "learning_rate": 4.502393416393757e-06, + "loss": 0.4032, + "step": 379 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 3.347480297088623, + "learning_rate": 4.4998487213631515e-06, + "loss": 0.5442, + "step": 380 + }, + { + "epoch": 2.0594594594594593, + "grad_norm": 3.940531015396118, + "learning_rate": 4.497298259287696e-06, + "loss": 0.6181, + "step": 381 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 3.0910496711730957, + "learning_rate": 4.494742037522261e-06, + "loss": 0.3829, + "step": 382 + }, + { + "epoch": 2.0702702702702704, + "grad_norm": 4.060451984405518, + "learning_rate": 4.4921800634383295e-06, + "loss": 0.4953, + "step": 383 + }, + { + "epoch": 2.075675675675676, + "grad_norm": 3.1667511463165283, + "learning_rate": 4.4896123444239655e-06, + "loss": 0.3254, + "step": 384 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 3.0239670276641846, + "learning_rate": 4.487038887883809e-06, + "loss": 0.555, + "step": 385 + }, + { + "epoch": 2.0864864864864865, + "grad_norm": 2.8815383911132812, + "learning_rate": 4.484459701239038e-06, + "loss": 0.665, + "step": 386 + }, + { + "epoch": 2.091891891891892, + "grad_norm": 3.615537166595459, + "learning_rate": 4.481874791927358e-06, + "loss": 0.2652, + "step": 387 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 3.407407283782959, + "learning_rate": 4.479284167402977e-06, + "loss": 0.3811, + "step": 388 + }, + { + "epoch": 2.1027027027027025, + "grad_norm": 2.6651623249053955, + "learning_rate": 4.476687835136585e-06, + "loss": 0.2463, + "step": 389 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 3.5145862102508545, + "learning_rate": 4.47408580261533e-06, + "loss": 0.5507, + "step": 390 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 3.0952725410461426, + "learning_rate": 4.471478077342798e-06, + "loss": 0.288, + "step": 391 + }, + { + "epoch": 2.118918918918919, + "grad_norm": 2.634775400161743, + "learning_rate": 4.468864666838994e-06, + "loss": 0.5169, + "step": 392 + }, + { + "epoch": 2.1243243243243244, + "grad_norm": 3.7388594150543213, + "learning_rate": 4.4662455786403125e-06, + "loss": 0.3327, + "step": 393 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 3.8197360038757324, + "learning_rate": 4.463620820299528e-06, + "loss": 0.3877, + "step": 394 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 3.0073485374450684, + "learning_rate": 4.4609903993857606e-06, + "loss": 0.5425, + "step": 395 + }, + { + "epoch": 2.1405405405405404, + "grad_norm": 2.6923868656158447, + "learning_rate": 4.458354323484462e-06, + "loss": 0.5257, + "step": 396 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 3.2151331901550293, + "learning_rate": 4.45571260019739e-06, + "loss": 0.3914, + "step": 397 + }, + { + "epoch": 2.1513513513513516, + "grad_norm": 3.4031248092651367, + "learning_rate": 4.453065237142592e-06, + "loss": 0.3455, + "step": 398 + }, + { + "epoch": 2.156756756756757, + "grad_norm": 3.012275457382202, + "learning_rate": 4.4504122419543745e-06, + "loss": 0.4652, + "step": 399 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 3.3084208965301514, + "learning_rate": 4.4477536222832865e-06, + "loss": 0.6343, + "step": 400 + }, + { + "epoch": 2.1675675675675676, + "grad_norm": 3.115206241607666, + "learning_rate": 4.445089385796099e-06, + "loss": 0.6975, + "step": 401 + }, + { + "epoch": 2.172972972972973, + "grad_norm": 2.893930435180664, + "learning_rate": 4.442419540175778e-06, + "loss": 0.5779, + "step": 402 + }, + { + "epoch": 2.1783783783783783, + "grad_norm": 3.0549168586730957, + "learning_rate": 4.439744093121465e-06, + "loss": 0.4541, + "step": 403 + }, + { + "epoch": 2.1837837837837837, + "grad_norm": 3.1189024448394775, + "learning_rate": 4.437063052348457e-06, + "loss": 0.4078, + "step": 404 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 6.644659042358398, + "learning_rate": 4.434376425588179e-06, + "loss": 0.6759, + "step": 405 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 2.807554006576538, + "learning_rate": 4.431684220588163e-06, + "loss": 0.2938, + "step": 406 + }, + { + "epoch": 2.2, + "grad_norm": 3.6900999546051025, + "learning_rate": 4.428986445112034e-06, + "loss": 0.676, + "step": 407 + }, + { + "epoch": 2.2054054054054055, + "grad_norm": 2.0721664428710938, + "learning_rate": 4.426283106939474e-06, + "loss": 0.1859, + "step": 408 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 2.953388214111328, + "learning_rate": 4.423574213866209e-06, + "loss": 0.2955, + "step": 409 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 3.049050807952881, + "learning_rate": 4.420859773703985e-06, + "loss": 0.2262, + "step": 410 + }, + { + "epoch": 2.2216216216216216, + "grad_norm": 3.319796323776245, + "learning_rate": 4.418139794280542e-06, + "loss": 0.2273, + "step": 411 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 2.4133522510528564, + "learning_rate": 4.415414283439595e-06, + "loss": 0.3282, + "step": 412 + }, + { + "epoch": 2.2324324324324323, + "grad_norm": 2.9842193126678467, + "learning_rate": 4.4126832490408116e-06, + "loss": 0.3651, + "step": 413 + }, + { + "epoch": 2.237837837837838, + "grad_norm": 2.759531259536743, + "learning_rate": 4.409946698959784e-06, + "loss": 0.4052, + "step": 414 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 3.045485019683838, + "learning_rate": 4.4072046410880145e-06, + "loss": 0.4638, + "step": 415 + }, + { + "epoch": 2.2486486486486488, + "grad_norm": 3.0058295726776123, + "learning_rate": 4.404457083332887e-06, + "loss": 0.517, + "step": 416 + }, + { + "epoch": 2.254054054054054, + "grad_norm": 3.025688409805298, + "learning_rate": 4.401704033617643e-06, + "loss": 0.6902, + "step": 417 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 3.3047802448272705, + "learning_rate": 4.398945499881366e-06, + "loss": 0.3552, + "step": 418 + }, + { + "epoch": 2.264864864864865, + "grad_norm": 3.0683655738830566, + "learning_rate": 4.396181490078949e-06, + "loss": 0.286, + "step": 419 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 3.627681016921997, + "learning_rate": 4.393412012181082e-06, + "loss": 0.4036, + "step": 420 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 4.552238941192627, + "learning_rate": 4.390637074174219e-06, + "loss": 0.8037, + "step": 421 + }, + { + "epoch": 2.281081081081081, + "grad_norm": 2.8688855171203613, + "learning_rate": 4.387856684060561e-06, + "loss": 0.2553, + "step": 422 + }, + { + "epoch": 2.2864864864864867, + "grad_norm": 4.21850061416626, + "learning_rate": 4.385070849858033e-06, + "loss": 0.6222, + "step": 423 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 3.038433790206909, + "learning_rate": 4.382279579600257e-06, + "loss": 0.5326, + "step": 424 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 3.297300338745117, + "learning_rate": 4.379482881336532e-06, + "loss": 0.5515, + "step": 425 + }, + { + "epoch": 2.3027027027027027, + "grad_norm": 7.162952423095703, + "learning_rate": 4.376680763131811e-06, + "loss": 0.6948, + "step": 426 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 3.2403595447540283, + "learning_rate": 4.373873233066676e-06, + "loss": 0.2947, + "step": 427 + }, + { + "epoch": 2.3135135135135134, + "grad_norm": 3.2969906330108643, + "learning_rate": 4.371060299237315e-06, + "loss": 0.2261, + "step": 428 + }, + { + "epoch": 2.3189189189189188, + "grad_norm": 2.669058322906494, + "learning_rate": 4.368241969755499e-06, + "loss": 0.5398, + "step": 429 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 2.7643518447875977, + "learning_rate": 4.36541825274856e-06, + "loss": 0.3301, + "step": 430 + }, + { + "epoch": 2.32972972972973, + "grad_norm": 3.6037657260894775, + "learning_rate": 4.3625891563593635e-06, + "loss": 0.6064, + "step": 431 + }, + { + "epoch": 2.3351351351351353, + "grad_norm": 2.8805618286132812, + "learning_rate": 4.35975468874629e-06, + "loss": 0.3897, + "step": 432 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 2.642402172088623, + "learning_rate": 4.356914858083211e-06, + "loss": 0.271, + "step": 433 + }, + { + "epoch": 2.345945945945946, + "grad_norm": 2.916337490081787, + "learning_rate": 4.354069672559458e-06, + "loss": 0.3681, + "step": 434 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 3.3312325477600098, + "learning_rate": 4.35121914037981e-06, + "loss": 0.298, + "step": 435 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 2.980583906173706, + "learning_rate": 4.348363269764462e-06, + "loss": 0.3618, + "step": 436 + }, + { + "epoch": 2.362162162162162, + "grad_norm": 3.5010197162628174, + "learning_rate": 4.345502068949003e-06, + "loss": 0.8972, + "step": 437 + }, + { + "epoch": 2.3675675675675674, + "grad_norm": 2.7187814712524414, + "learning_rate": 4.342635546184394e-06, + "loss": 0.3939, + "step": 438 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 2.8368170261383057, + "learning_rate": 4.339763709736944e-06, + "loss": 0.5462, + "step": 439 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 2.6989636421203613, + "learning_rate": 4.336886567888283e-06, + "loss": 0.5932, + "step": 440 + }, + { + "epoch": 2.383783783783784, + "grad_norm": 3.2514829635620117, + "learning_rate": 4.334004128935342e-06, + "loss": 0.4622, + "step": 441 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 5.242766857147217, + "learning_rate": 4.331116401190327e-06, + "loss": 0.5997, + "step": 442 + }, + { + "epoch": 2.3945945945945946, + "grad_norm": 3.492724657058716, + "learning_rate": 4.328223392980696e-06, + "loss": 0.3072, + "step": 443 + }, + { + "epoch": 2.4, + "grad_norm": 4.074132442474365, + "learning_rate": 4.325325112649134e-06, + "loss": 0.5338, + "step": 444 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 2.7208468914031982, + "learning_rate": 4.322421568553529e-06, + "loss": 0.3266, + "step": 445 + }, + { + "epoch": 2.410810810810811, + "grad_norm": 2.929180383682251, + "learning_rate": 4.3195127690669494e-06, + "loss": 0.4064, + "step": 446 + }, + { + "epoch": 2.4162162162162164, + "grad_norm": 2.848353624343872, + "learning_rate": 4.3165987225776186e-06, + "loss": 0.3856, + "step": 447 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 3.946488618850708, + "learning_rate": 4.313679437488889e-06, + "loss": 0.4261, + "step": 448 + }, + { + "epoch": 2.427027027027027, + "grad_norm": 5.781888961791992, + "learning_rate": 4.310754922219223e-06, + "loss": 0.4943, + "step": 449 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 2.8406941890716553, + "learning_rate": 4.307825185202164e-06, + "loss": 0.2874, + "step": 450 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 3.2017335891723633, + "learning_rate": 4.3048902348863116e-06, + "loss": 0.4218, + "step": 451 + }, + { + "epoch": 2.443243243243243, + "grad_norm": 3.8355906009674072, + "learning_rate": 4.301950079735303e-06, + "loss": 0.4204, + "step": 452 + }, + { + "epoch": 2.4486486486486485, + "grad_norm": 4.783357620239258, + "learning_rate": 4.299004728227782e-06, + "loss": 0.5593, + "step": 453 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 3.014080762863159, + "learning_rate": 4.2960541888573774e-06, + "loss": 0.4187, + "step": 454 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 3.5906598567962646, + "learning_rate": 4.29309847013268e-06, + "loss": 0.4193, + "step": 455 + }, + { + "epoch": 2.464864864864865, + "grad_norm": 3.9043331146240234, + "learning_rate": 4.290137580577216e-06, + "loss": 0.7035, + "step": 456 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 3.139753580093384, + "learning_rate": 4.287171528729423e-06, + "loss": 0.5877, + "step": 457 + }, + { + "epoch": 2.4756756756756757, + "grad_norm": 2.9091074466705322, + "learning_rate": 4.284200323142623e-06, + "loss": 0.5309, + "step": 458 + }, + { + "epoch": 2.481081081081081, + "grad_norm": 3.1253795623779297, + "learning_rate": 4.281223972385004e-06, + "loss": 0.448, + "step": 459 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 2.65510892868042, + "learning_rate": 4.27824248503959e-06, + "loss": 0.4453, + "step": 460 + }, + { + "epoch": 2.4918918918918918, + "grad_norm": 3.2135510444641113, + "learning_rate": 4.275255869704214e-06, + "loss": 0.5582, + "step": 461 + }, + { + "epoch": 2.4972972972972975, + "grad_norm": 2.452545404434204, + "learning_rate": 4.272264134991503e-06, + "loss": 0.423, + "step": 462 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 2.6370208263397217, + "learning_rate": 4.269267289528843e-06, + "loss": 0.271, + "step": 463 + }, + { + "epoch": 2.5081081081081082, + "grad_norm": 3.31266450881958, + "learning_rate": 4.266265341958356e-06, + "loss": 0.6459, + "step": 464 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 3.2743148803710938, + "learning_rate": 4.263258300936882e-06, + "loss": 0.2959, + "step": 465 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 2.883549690246582, + "learning_rate": 4.260246175135948e-06, + "loss": 0.3418, + "step": 466 + }, + { + "epoch": 2.5243243243243243, + "grad_norm": 2.7019498348236084, + "learning_rate": 4.257228973241742e-06, + "loss": 0.3459, + "step": 467 + }, + { + "epoch": 2.5297297297297296, + "grad_norm": 3.8166959285736084, + "learning_rate": 4.254206703955092e-06, + "loss": 0.4769, + "step": 468 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 3.264763593673706, + "learning_rate": 4.251179375991438e-06, + "loss": 0.6487, + "step": 469 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 2.7936933040618896, + "learning_rate": 4.248146998080808e-06, + "loss": 0.5547, + "step": 470 + }, + { + "epoch": 2.545945945945946, + "grad_norm": 3.21852707862854, + "learning_rate": 4.2451095789677945e-06, + "loss": 0.2965, + "step": 471 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 3.4528985023498535, + "learning_rate": 4.242067127411525e-06, + "loss": 0.3831, + "step": 472 + }, + { + "epoch": 2.556756756756757, + "grad_norm": 4.317023754119873, + "learning_rate": 4.239019652185642e-06, + "loss": 0.1756, + "step": 473 + }, + { + "epoch": 2.562162162162162, + "grad_norm": 3.677452325820923, + "learning_rate": 4.2359671620782725e-06, + "loss": 0.5136, + "step": 474 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 3.7563393115997314, + "learning_rate": 4.232909665892005e-06, + "loss": 0.6554, + "step": 475 + }, + { + "epoch": 2.572972972972973, + "grad_norm": 3.5125508308410645, + "learning_rate": 4.229847172443866e-06, + "loss": 0.3804, + "step": 476 + }, + { + "epoch": 2.5783783783783782, + "grad_norm": 2.8835806846618652, + "learning_rate": 4.2267796905652926e-06, + "loss": 0.3338, + "step": 477 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 3.2136261463165283, + "learning_rate": 4.223707229102105e-06, + "loss": 0.6163, + "step": 478 + }, + { + "epoch": 2.589189189189189, + "grad_norm": 3.467475175857544, + "learning_rate": 4.220629796914487e-06, + "loss": 0.3005, + "step": 479 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 3.597490072250366, + "learning_rate": 4.217547402876954e-06, + "loss": 0.56, + "step": 480 + }, + { + "epoch": 2.6, + "grad_norm": 3.2377140522003174, + "learning_rate": 4.214460055878329e-06, + "loss": 0.4512, + "step": 481 + }, + { + "epoch": 2.6054054054054054, + "grad_norm": 2.577746868133545, + "learning_rate": 4.211367764821722e-06, + "loss": 0.3074, + "step": 482 + }, + { + "epoch": 2.610810810810811, + "grad_norm": 3.6584155559539795, + "learning_rate": 4.208270538624497e-06, + "loss": 0.6752, + "step": 483 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 2.602778434753418, + "learning_rate": 4.205168386218251e-06, + "loss": 0.2347, + "step": 484 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 3.587503433227539, + "learning_rate": 4.2020613165487865e-06, + "loss": 0.5189, + "step": 485 + }, + { + "epoch": 2.627027027027027, + "grad_norm": 3.9341986179351807, + "learning_rate": 4.198949338576086e-06, + "loss": 0.7739, + "step": 486 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 2.9211957454681396, + "learning_rate": 4.1958324612742875e-06, + "loss": 0.3495, + "step": 487 + }, + { + "epoch": 2.637837837837838, + "grad_norm": 3.29193115234375, + "learning_rate": 4.1927106936316564e-06, + "loss": 0.2257, + "step": 488 + }, + { + "epoch": 2.6432432432432433, + "grad_norm": 3.3687057495117188, + "learning_rate": 4.189584044650559e-06, + "loss": 0.6708, + "step": 489 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 3.096428155899048, + "learning_rate": 4.186452523347441e-06, + "loss": 0.3126, + "step": 490 + }, + { + "epoch": 2.654054054054054, + "grad_norm": 3.0865559577941895, + "learning_rate": 4.183316138752799e-06, + "loss": 0.4219, + "step": 491 + }, + { + "epoch": 2.6594594594594594, + "grad_norm": 3.389827013015747, + "learning_rate": 4.180174899911149e-06, + "loss": 0.3937, + "step": 492 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 3.044360637664795, + "learning_rate": 4.177028815881012e-06, + "loss": 0.4098, + "step": 493 + }, + { + "epoch": 2.6702702702702705, + "grad_norm": 2.813094139099121, + "learning_rate": 4.173877895734875e-06, + "loss": 0.3597, + "step": 494 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 2.4037158489227295, + "learning_rate": 4.1707221485591764e-06, + "loss": 0.3284, + "step": 495 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 3.049436092376709, + "learning_rate": 4.167561583454272e-06, + "loss": 0.257, + "step": 496 + }, + { + "epoch": 2.6864864864864866, + "grad_norm": 3.458923816680908, + "learning_rate": 4.164396209534411e-06, + "loss": 0.1819, + "step": 497 + }, + { + "epoch": 2.691891891891892, + "grad_norm": 3.3084232807159424, + "learning_rate": 4.161226035927711e-06, + "loss": 0.7109, + "step": 498 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 3.034550189971924, + "learning_rate": 4.15805107177613e-06, + "loss": 0.6297, + "step": 499 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 3.5786449909210205, + "learning_rate": 4.15487132623544e-06, + "loss": 0.5195, + "step": 500 + }, + { + "epoch": 2.708108108108108, + "grad_norm": 3.4477646350860596, + "learning_rate": 4.151686808475204e-06, + "loss": 0.2528, + "step": 501 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 3.0256869792938232, + "learning_rate": 4.148497527678744e-06, + "loss": 0.5013, + "step": 502 + }, + { + "epoch": 2.718918918918919, + "grad_norm": 2.875121593475342, + "learning_rate": 4.145303493043118e-06, + "loss": 0.4109, + "step": 503 + }, + { + "epoch": 2.7243243243243245, + "grad_norm": 2.7204222679138184, + "learning_rate": 4.1421047137790935e-06, + "loss": 0.3197, + "step": 504 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 3.350482702255249, + "learning_rate": 4.13890119911112e-06, + "loss": 0.6369, + "step": 505 + }, + { + "epoch": 2.735135135135135, + "grad_norm": 3.096774101257324, + "learning_rate": 4.135692958277303e-06, + "loss": 0.4581, + "step": 506 + }, + { + "epoch": 2.7405405405405405, + "grad_norm": 2.8896536827087402, + "learning_rate": 4.132480000529375e-06, + "loss": 0.6217, + "step": 507 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 2.643932580947876, + "learning_rate": 4.129262335132676e-06, + "loss": 0.4951, + "step": 508 + }, + { + "epoch": 2.7513513513513512, + "grad_norm": 2.6077864170074463, + "learning_rate": 4.126039971366114e-06, + "loss": 0.2185, + "step": 509 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 2.531507968902588, + "learning_rate": 4.122812918522154e-06, + "loss": 0.5428, + "step": 510 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 4.125836372375488, + "learning_rate": 4.119581185906776e-06, + "loss": 0.5466, + "step": 511 + }, + { + "epoch": 2.7675675675675677, + "grad_norm": 2.9921016693115234, + "learning_rate": 4.1163447828394595e-06, + "loss": 0.3803, + "step": 512 + }, + { + "epoch": 2.772972972972973, + "grad_norm": 2.9517931938171387, + "learning_rate": 4.113103718653152e-06, + "loss": 0.2722, + "step": 513 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 2.8333382606506348, + "learning_rate": 4.10985800269424e-06, + "loss": 0.333, + "step": 514 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 2.94168758392334, + "learning_rate": 4.106607644322529e-06, + "loss": 0.2186, + "step": 515 + }, + { + "epoch": 2.789189189189189, + "grad_norm": 3.2743892669677734, + "learning_rate": 4.103352652911207e-06, + "loss": 0.6365, + "step": 516 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 4.692770004272461, + "learning_rate": 4.100093037846825e-06, + "loss": 0.7261, + "step": 517 + }, + { + "epoch": 2.8, + "grad_norm": 3.2157247066497803, + "learning_rate": 4.0968288085292675e-06, + "loss": 0.2767, + "step": 518 + }, + { + "epoch": 2.8054054054054056, + "grad_norm": 3.196887731552124, + "learning_rate": 4.093559974371725e-06, + "loss": 0.4743, + "step": 519 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 2.406752586364746, + "learning_rate": 4.090286544800667e-06, + "loss": 0.3789, + "step": 520 + }, + { + "epoch": 2.8162162162162163, + "grad_norm": 3.1769447326660156, + "learning_rate": 4.087008529255815e-06, + "loss": 0.6252, + "step": 521 + }, + { + "epoch": 2.8216216216216217, + "grad_norm": 3.068370819091797, + "learning_rate": 4.083725937190115e-06, + "loss": 0.3467, + "step": 522 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 3.2665855884552, + "learning_rate": 4.0804387780697114e-06, + "loss": 0.3857, + "step": 523 + }, + { + "epoch": 2.8324324324324324, + "grad_norm": 3.368759870529175, + "learning_rate": 4.077147061373918e-06, + "loss": 0.4679, + "step": 524 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 3.989163875579834, + "learning_rate": 4.073850796595192e-06, + "loss": 0.2439, + "step": 525 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 3.6244685649871826, + "learning_rate": 4.070549993239106e-06, + "loss": 0.435, + "step": 526 + }, + { + "epoch": 2.8486486486486484, + "grad_norm": 3.585151195526123, + "learning_rate": 4.06724466082432e-06, + "loss": 0.5022, + "step": 527 + }, + { + "epoch": 2.854054054054054, + "grad_norm": 3.2420976161956787, + "learning_rate": 4.063934808882555e-06, + "loss": 0.4282, + "step": 528 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 3.1674294471740723, + "learning_rate": 4.0606204469585656e-06, + "loss": 0.3436, + "step": 529 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 2.6856706142425537, + "learning_rate": 4.057301584610112e-06, + "loss": 0.3889, + "step": 530 + }, + { + "epoch": 2.8702702702702703, + "grad_norm": 3.0438942909240723, + "learning_rate": 4.053978231407931e-06, + "loss": 0.4828, + "step": 531 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 3.3561246395111084, + "learning_rate": 4.0506503969357115e-06, + "loss": 0.5814, + "step": 532 + }, + { + "epoch": 2.881081081081081, + "grad_norm": 2.5318350791931152, + "learning_rate": 4.047318090790065e-06, + "loss": 0.4768, + "step": 533 + }, + { + "epoch": 2.8864864864864863, + "grad_norm": 2.587224006652832, + "learning_rate": 4.043981322580498e-06, + "loss": 0.4262, + "step": 534 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 2.73926043510437, + "learning_rate": 4.040640101929384e-06, + "loss": 0.421, + "step": 535 + }, + { + "epoch": 2.8972972972972975, + "grad_norm": 3.53908371925354, + "learning_rate": 4.037294438471936e-06, + "loss": 0.4019, + "step": 536 + }, + { + "epoch": 2.902702702702703, + "grad_norm": 3.0980448722839355, + "learning_rate": 4.033944341856181e-06, + "loss": 0.4322, + "step": 537 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 2.9265666007995605, + "learning_rate": 4.030589821742926e-06, + "loss": 0.3841, + "step": 538 + }, + { + "epoch": 2.9135135135135135, + "grad_norm": 3.4082043170928955, + "learning_rate": 4.0272308878057385e-06, + "loss": 0.7083, + "step": 539 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 3.297515630722046, + "learning_rate": 4.023867549730912e-06, + "loss": 0.5688, + "step": 540 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 3.0538225173950195, + "learning_rate": 4.020499817217441e-06, + "loss": 0.5979, + "step": 541 + }, + { + "epoch": 2.92972972972973, + "grad_norm": 3.1792757511138916, + "learning_rate": 4.017127699976992e-06, + "loss": 0.5034, + "step": 542 + }, + { + "epoch": 2.935135135135135, + "grad_norm": 3.1574482917785645, + "learning_rate": 4.013751207733877e-06, + "loss": 0.6656, + "step": 543 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 2.523123264312744, + "learning_rate": 4.010370350225023e-06, + "loss": 0.2789, + "step": 544 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 3.1950793266296387, + "learning_rate": 4.006985137199945e-06, + "loss": 0.2163, + "step": 545 + }, + { + "epoch": 2.9513513513513514, + "grad_norm": 3.2089648246765137, + "learning_rate": 4.00359557842072e-06, + "loss": 0.4179, + "step": 546 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 3.852578639984131, + "learning_rate": 4.000201683661958e-06, + "loss": 0.4683, + "step": 547 + }, + { + "epoch": 2.962162162162162, + "grad_norm": 2.7612597942352295, + "learning_rate": 3.996803462710766e-06, + "loss": 0.3506, + "step": 548 + }, + { + "epoch": 2.9675675675675675, + "grad_norm": 4.811823844909668, + "learning_rate": 3.993400925366736e-06, + "loss": 0.6582, + "step": 549 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 3.0135858058929443, + "learning_rate": 3.989994081441902e-06, + "loss": 0.504, + "step": 550 + }, + { + "epoch": 2.9783783783783786, + "grad_norm": 2.710277795791626, + "learning_rate": 3.986582940760717e-06, + "loss": 0.7362, + "step": 551 + }, + { + "epoch": 2.983783783783784, + "grad_norm": 3.175443649291992, + "learning_rate": 3.983167513160025e-06, + "loss": 0.4116, + "step": 552 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 3.101109743118286, + "learning_rate": 3.979747808489036e-06, + "loss": 0.2188, + "step": 553 + }, + { + "epoch": 2.9945945945945946, + "grad_norm": 3.2320079803466797, + "learning_rate": 3.976323836609289e-06, + "loss": 0.7558, + "step": 554 + }, + { + "epoch": 3.0, + "grad_norm": 3.6071934700012207, + "learning_rate": 3.9728956073946305e-06, + "loss": 0.6491, + "step": 555 + }, + { + "epoch": 3.0054054054054054, + "grad_norm": 3.1119353771209717, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1625, + "step": 556 + }, + { + "epoch": 3.0108108108108107, + "grad_norm": 3.0440328121185303, + "learning_rate": 3.966026416517321e-06, + "loss": 0.311, + "step": 557 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 4.069122791290283, + "learning_rate": 3.962585474663636e-06, + "loss": 0.5299, + "step": 558 + }, + { + "epoch": 3.0216216216216214, + "grad_norm": 2.878645896911621, + "learning_rate": 3.959140315092911e-06, + "loss": 0.2718, + "step": 559 + }, + { + "epoch": 3.027027027027027, + "grad_norm": 3.526695966720581, + "learning_rate": 3.955690947740092e-06, + "loss": 0.2954, + "step": 560 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 3.25087308883667, + "learning_rate": 3.95223738255226e-06, + "loss": 0.2388, + "step": 561 + }, + { + "epoch": 3.037837837837838, + "grad_norm": 3.5467700958251953, + "learning_rate": 3.9487796294886015e-06, + "loss": 0.2014, + "step": 562 + }, + { + "epoch": 3.0432432432432432, + "grad_norm": 4.397517681121826, + "learning_rate": 3.945317698520379e-06, + "loss": 0.2102, + "step": 563 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 3.7297182083129883, + "learning_rate": 3.941851599630903e-06, + "loss": 0.499, + "step": 564 + }, + { + "epoch": 3.054054054054054, + "grad_norm": 4.417158603668213, + "learning_rate": 3.938381342815503e-06, + "loss": 0.3392, + "step": 565 + }, + { + "epoch": 3.0594594594594593, + "grad_norm": 4.6037421226501465, + "learning_rate": 3.934906938081499e-06, + "loss": 0.1942, + "step": 566 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 3.5600531101226807, + "learning_rate": 3.931428395448174e-06, + "loss": 0.1753, + "step": 567 + }, + { + "epoch": 3.0702702702702704, + "grad_norm": 2.868013381958008, + "learning_rate": 3.927945724946743e-06, + "loss": 0.2959, + "step": 568 + }, + { + "epoch": 3.075675675675676, + "grad_norm": 3.5543227195739746, + "learning_rate": 3.924458936620322e-06, + "loss": 0.4625, + "step": 569 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 8.972922325134277, + "learning_rate": 3.920968040523904e-06, + "loss": 0.2571, + "step": 570 + }, + { + "epoch": 3.0864864864864865, + "grad_norm": 3.037388324737549, + "learning_rate": 3.917473046724329e-06, + "loss": 0.1438, + "step": 571 + }, + { + "epoch": 3.091891891891892, + "grad_norm": 3.3261702060699463, + "learning_rate": 3.9139739653002525e-06, + "loss": 0.3572, + "step": 572 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 2.425293207168579, + "learning_rate": 3.910470806342117e-06, + "loss": 0.165, + "step": 573 + }, + { + "epoch": 3.1027027027027025, + "grad_norm": 3.5718603134155273, + "learning_rate": 3.9069635799521245e-06, + "loss": 0.3209, + "step": 574 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 3.8211171627044678, + "learning_rate": 3.903452296244204e-06, + "loss": 0.1976, + "step": 575 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 5.944535255432129, + "learning_rate": 3.899936965343989e-06, + "loss": 0.6074, + "step": 576 + }, + { + "epoch": 3.118918918918919, + "grad_norm": 6.603860378265381, + "learning_rate": 3.89641759738878e-06, + "loss": 0.4051, + "step": 577 + }, + { + "epoch": 3.1243243243243244, + "grad_norm": 6.712981700897217, + "learning_rate": 3.892894202527523e-06, + "loss": 0.3787, + "step": 578 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 3.267186403274536, + "learning_rate": 3.8893667909207735e-06, + "loss": 0.0927, + "step": 579 + }, + { + "epoch": 3.135135135135135, + "grad_norm": 4.476837158203125, + "learning_rate": 3.88583537274067e-06, + "loss": 0.4706, + "step": 580 + }, + { + "epoch": 3.1405405405405404, + "grad_norm": 4.272335052490234, + "learning_rate": 3.8822999581709085e-06, + "loss": 0.3949, + "step": 581 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 3.6685309410095215, + "learning_rate": 3.878760557406708e-06, + "loss": 0.1971, + "step": 582 + }, + { + "epoch": 3.1513513513513516, + "grad_norm": 3.9899449348449707, + "learning_rate": 3.875217180654779e-06, + "loss": 0.5156, + "step": 583 + }, + { + "epoch": 3.156756756756757, + "grad_norm": 3.866804361343384, + "learning_rate": 3.871669838133303e-06, + "loss": 0.3552, + "step": 584 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 3.565648317337036, + "learning_rate": 3.868118540071894e-06, + "loss": 0.4369, + "step": 585 + }, + { + "epoch": 3.1675675675675676, + "grad_norm": 3.5073986053466797, + "learning_rate": 3.8645632967115755e-06, + "loss": 0.3694, + "step": 586 + }, + { + "epoch": 3.172972972972973, + "grad_norm": 3.7636868953704834, + "learning_rate": 3.861004118304746e-06, + "loss": 0.3404, + "step": 587 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 2.940094232559204, + "learning_rate": 3.857441015115154e-06, + "loss": 0.3086, + "step": 588 + }, + { + "epoch": 3.1837837837837837, + "grad_norm": 3.727414608001709, + "learning_rate": 3.8538739974178635e-06, + "loss": 0.253, + "step": 589 + }, + { + "epoch": 3.189189189189189, + "grad_norm": 3.5140156745910645, + "learning_rate": 3.850303075499227e-06, + "loss": 0.2436, + "step": 590 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 3.545952558517456, + "learning_rate": 3.84672825965686e-06, + "loss": 0.328, + "step": 591 + }, + { + "epoch": 3.2, + "grad_norm": 3.534240484237671, + "learning_rate": 3.843149560199601e-06, + "loss": 0.2687, + "step": 592 + }, + { + "epoch": 3.2054054054054055, + "grad_norm": 2.8464927673339844, + "learning_rate": 3.839566987447492e-06, + "loss": 0.1417, + "step": 593 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 4.138559818267822, + "learning_rate": 3.835980551731743e-06, + "loss": 0.2106, + "step": 594 + }, + { + "epoch": 3.2162162162162162, + "grad_norm": 2.917670249938965, + "learning_rate": 3.8323902633947045e-06, + "loss": 0.3154, + "step": 595 + }, + { + "epoch": 3.2216216216216216, + "grad_norm": 3.029660224914551, + "learning_rate": 3.828796132789835e-06, + "loss": 0.1218, + "step": 596 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 3.2845771312713623, + "learning_rate": 3.825198170281677e-06, + "loss": 0.1336, + "step": 597 + }, + { + "epoch": 3.2324324324324323, + "grad_norm": 3.1375670433044434, + "learning_rate": 3.821596386245819e-06, + "loss": 0.2518, + "step": 598 + }, + { + "epoch": 3.237837837837838, + "grad_norm": 3.0021941661834717, + "learning_rate": 3.817990791068874e-06, + "loss": 0.2762, + "step": 599 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 4.141000747680664, + "learning_rate": 3.81438139514844e-06, + "loss": 0.2722, + "step": 600 + }, + { + "epoch": 3.2486486486486488, + "grad_norm": 3.9065279960632324, + "learning_rate": 3.8107682088930797e-06, + "loss": 0.3542, + "step": 601 + }, + { + "epoch": 3.254054054054054, + "grad_norm": 3.718417167663574, + "learning_rate": 3.807151242722286e-06, + "loss": 0.344, + "step": 602 + }, + { + "epoch": 3.2594594594594595, + "grad_norm": 4.013717174530029, + "learning_rate": 3.8035305070664484e-06, + "loss": 0.1625, + "step": 603 + }, + { + "epoch": 3.264864864864865, + "grad_norm": 3.348888397216797, + "learning_rate": 3.7999060123668318e-06, + "loss": 0.2925, + "step": 604 + }, + { + "epoch": 3.27027027027027, + "grad_norm": 3.496079206466675, + "learning_rate": 3.7962777690755364e-06, + "loss": 0.1523, + "step": 605 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 3.07607102394104, + "learning_rate": 3.792645787655476e-06, + "loss": 0.1674, + "step": 606 + }, + { + "epoch": 3.281081081081081, + "grad_norm": 3.4036154747009277, + "learning_rate": 3.7890100785803425e-06, + "loss": 0.2856, + "step": 607 + }, + { + "epoch": 3.2864864864864867, + "grad_norm": 6.092559337615967, + "learning_rate": 3.785370652334577e-06, + "loss": 0.1094, + "step": 608 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 3.9322001934051514, + "learning_rate": 3.7817275194133403e-06, + "loss": 0.2611, + "step": 609 + }, + { + "epoch": 3.2972972972972974, + "grad_norm": 3.189563274383545, + "learning_rate": 3.778080690322483e-06, + "loss": 0.1315, + "step": 610 + }, + { + "epoch": 3.3027027027027027, + "grad_norm": 4.304934024810791, + "learning_rate": 3.774430175578514e-06, + "loss": 0.1686, + "step": 611 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 2.9030067920684814, + "learning_rate": 3.7707759857085706e-06, + "loss": 0.4642, + "step": 612 + }, + { + "epoch": 3.3135135135135134, + "grad_norm": 3.7485930919647217, + "learning_rate": 3.7671181312503886e-06, + "loss": 0.1987, + "step": 613 + }, + { + "epoch": 3.3189189189189188, + "grad_norm": 3.4700896739959717, + "learning_rate": 3.763456622752271e-06, + "loss": 0.3307, + "step": 614 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 3.0079376697540283, + "learning_rate": 3.7597914707730583e-06, + "loss": 0.1731, + "step": 615 + }, + { + "epoch": 3.32972972972973, + "grad_norm": 3.155235767364502, + "learning_rate": 3.7561226858820984e-06, + "loss": 0.2003, + "step": 616 + }, + { + "epoch": 3.3351351351351353, + "grad_norm": 3.847895622253418, + "learning_rate": 3.7524502786592143e-06, + "loss": 0.4014, + "step": 617 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 2.7505502700805664, + "learning_rate": 3.7487742596946753e-06, + "loss": 0.205, + "step": 618 + }, + { + "epoch": 3.345945945945946, + "grad_norm": 3.654529571533203, + "learning_rate": 3.7450946395891674e-06, + "loss": 0.2932, + "step": 619 + }, + { + "epoch": 3.3513513513513513, + "grad_norm": 2.9763967990875244, + "learning_rate": 3.7414114289537593e-06, + "loss": 0.2748, + "step": 620 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 3.889683961868286, + "learning_rate": 3.7377246384098763e-06, + "loss": 0.3665, + "step": 621 + }, + { + "epoch": 3.362162162162162, + "grad_norm": 4.193166732788086, + "learning_rate": 3.7340342785892645e-06, + "loss": 0.3453, + "step": 622 + }, + { + "epoch": 3.3675675675675674, + "grad_norm": 3.4371488094329834, + "learning_rate": 3.7303403601339646e-06, + "loss": 0.473, + "step": 623 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 3.6939027309417725, + "learning_rate": 3.726642893696279e-06, + "loss": 0.3017, + "step": 624 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 4.904304504394531, + "learning_rate": 3.7229418899387414e-06, + "loss": 0.4841, + "step": 625 + }, + { + "epoch": 3.383783783783784, + "grad_norm": 3.6373438835144043, + "learning_rate": 3.719237359534087e-06, + "loss": 0.3879, + "step": 626 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 3.403676986694336, + "learning_rate": 3.71552931316522e-06, + "loss": 0.3876, + "step": 627 + }, + { + "epoch": 3.3945945945945946, + "grad_norm": 3.2292237281799316, + "learning_rate": 3.7118177615251834e-06, + "loss": 0.4491, + "step": 628 + }, + { + "epoch": 3.4, + "grad_norm": 3.317850351333618, + "learning_rate": 3.70810271531713e-06, + "loss": 0.3763, + "step": 629 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 3.664735794067383, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4171, + "step": 630 + }, + { + "epoch": 3.410810810810811, + "grad_norm": 3.781569242477417, + "learning_rate": 3.700662182059936e-06, + "loss": 0.2445, + "step": 631 + }, + { + "epoch": 3.4162162162162164, + "grad_norm": 2.878260850906372, + "learning_rate": 3.696936716467363e-06, + "loss": 0.1347, + "step": 632 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 2.8670761585235596, + "learning_rate": 3.693207799219846e-06, + "loss": 0.2822, + "step": 633 + }, + { + "epoch": 3.427027027027027, + "grad_norm": 3.9338245391845703, + "learning_rate": 3.689475441070615e-06, + "loss": 0.3425, + "step": 634 + }, + { + "epoch": 3.4324324324324325, + "grad_norm": 3.3172149658203125, + "learning_rate": 3.685739652782822e-06, + "loss": 0.3315, + "step": 635 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 3.9986648559570312, + "learning_rate": 3.682000445129512e-06, + "loss": 0.1841, + "step": 636 + }, + { + "epoch": 3.443243243243243, + "grad_norm": 3.4503986835479736, + "learning_rate": 3.6782578288935896e-06, + "loss": 0.3151, + "step": 637 + }, + { + "epoch": 3.4486486486486485, + "grad_norm": 3.8826167583465576, + "learning_rate": 3.6745118148677882e-06, + "loss": 0.1272, + "step": 638 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 3.0585904121398926, + "learning_rate": 3.6707624138546414e-06, + "loss": 0.2436, + "step": 639 + }, + { + "epoch": 3.4594594594594597, + "grad_norm": 3.8409557342529297, + "learning_rate": 3.6670096366664477e-06, + "loss": 0.6321, + "step": 640 + }, + { + "epoch": 3.464864864864865, + "grad_norm": 3.7260093688964844, + "learning_rate": 3.663253494125244e-06, + "loss": 0.1262, + "step": 641 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 3.195587396621704, + "learning_rate": 3.6594939970627706e-06, + "loss": 0.2669, + "step": 642 + }, + { + "epoch": 3.4756756756756757, + "grad_norm": 2.565070629119873, + "learning_rate": 3.655731156320441e-06, + "loss": 0.1228, + "step": 643 + }, + { + "epoch": 3.481081081081081, + "grad_norm": 3.745422124862671, + "learning_rate": 3.651964982749312e-06, + "loss": 0.1759, + "step": 644 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 4.96168327331543, + "learning_rate": 3.648195487210051e-06, + "loss": 0.5677, + "step": 645 + }, + { + "epoch": 3.4918918918918918, + "grad_norm": 3.514446496963501, + "learning_rate": 3.644422680572906e-06, + "loss": 0.1874, + "step": 646 + }, + { + "epoch": 3.4972972972972975, + "grad_norm": 3.1427719593048096, + "learning_rate": 3.640646573717671e-06, + "loss": 0.3225, + "step": 647 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 3.32208514213562, + "learning_rate": 3.63686717753366e-06, + "loss": 0.102, + "step": 648 + }, + { + "epoch": 3.5081081081081082, + "grad_norm": 3.409299373626709, + "learning_rate": 3.6330845029196697e-06, + "loss": 0.1585, + "step": 649 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 2.827052116394043, + "learning_rate": 3.629298560783952e-06, + "loss": 0.3046, + "step": 650 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 3.541518211364746, + "learning_rate": 3.6255093620441835e-06, + "loss": 0.2037, + "step": 651 + }, + { + "epoch": 3.5243243243243243, + "grad_norm": 3.067040205001831, + "learning_rate": 3.6217169176274293e-06, + "loss": 0.1784, + "step": 652 + }, + { + "epoch": 3.5297297297297296, + "grad_norm": 4.001040935516357, + "learning_rate": 3.6179212384701146e-06, + "loss": 0.1974, + "step": 653 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 4.03037691116333, + "learning_rate": 3.6141223355179946e-06, + "loss": 0.2161, + "step": 654 + }, + { + "epoch": 3.5405405405405403, + "grad_norm": 3.303591728210449, + "learning_rate": 3.610320219726118e-06, + "loss": 0.1487, + "step": 655 + }, + { + "epoch": 3.545945945945946, + "grad_norm": 4.183008193969727, + "learning_rate": 3.606514902058802e-06, + "loss": 0.2231, + "step": 656 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 4.2100300788879395, + "learning_rate": 3.602706393489594e-06, + "loss": 0.5068, + "step": 657 + }, + { + "epoch": 3.556756756756757, + "grad_norm": 4.521003246307373, + "learning_rate": 3.598894705001246e-06, + "loss": 0.4621, + "step": 658 + }, + { + "epoch": 3.562162162162162, + "grad_norm": 3.452348470687866, + "learning_rate": 3.5950798475856783e-06, + "loss": 0.285, + "step": 659 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 3.468987464904785, + "learning_rate": 3.5912618322439487e-06, + "loss": 0.4277, + "step": 660 + }, + { + "epoch": 3.572972972972973, + "grad_norm": 3.431551933288574, + "learning_rate": 3.587440669986224e-06, + "loss": 0.1993, + "step": 661 + }, + { + "epoch": 3.5783783783783782, + "grad_norm": 3.017648220062256, + "learning_rate": 3.5836163718317453e-06, + "loss": 0.272, + "step": 662 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 3.837244987487793, + "learning_rate": 3.5797889488087946e-06, + "loss": 0.6019, + "step": 663 + }, + { + "epoch": 3.589189189189189, + "grad_norm": 3.221762180328369, + "learning_rate": 3.575958411954668e-06, + "loss": 0.3603, + "step": 664 + }, + { + "epoch": 3.5945945945945947, + "grad_norm": 4.279484272003174, + "learning_rate": 3.5721247723156393e-06, + "loss": 0.4656, + "step": 665 + }, + { + "epoch": 3.6, + "grad_norm": 3.723459243774414, + "learning_rate": 3.5682880409469316e-06, + "loss": 0.2466, + "step": 666 + }, + { + "epoch": 3.6054054054054054, + "grad_norm": 2.7260632514953613, + "learning_rate": 3.564448228912682e-06, + "loss": 0.1848, + "step": 667 + }, + { + "epoch": 3.610810810810811, + "grad_norm": 3.6656649112701416, + "learning_rate": 3.5606053472859124e-06, + "loss": 0.4968, + "step": 668 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 4.570294380187988, + "learning_rate": 3.556759407148496e-06, + "loss": 0.316, + "step": 669 + }, + { + "epoch": 3.6216216216216215, + "grad_norm": 3.174433946609497, + "learning_rate": 3.5529104195911258e-06, + "loss": 0.2232, + "step": 670 + }, + { + "epoch": 3.627027027027027, + "grad_norm": 4.481954574584961, + "learning_rate": 3.549058395713285e-06, + "loss": 0.4435, + "step": 671 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 3.8758301734924316, + "learning_rate": 3.54520334662321e-06, + "loss": 0.1455, + "step": 672 + }, + { + "epoch": 3.637837837837838, + "grad_norm": 3.1699628829956055, + "learning_rate": 3.5413452834378626e-06, + "loss": 0.3037, + "step": 673 + }, + { + "epoch": 3.6432432432432433, + "grad_norm": 3.8971962928771973, + "learning_rate": 3.5374842172828953e-06, + "loss": 0.4309, + "step": 674 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 3.3087549209594727, + "learning_rate": 3.533620159292621e-06, + "loss": 0.383, + "step": 675 + }, + { + "epoch": 3.654054054054054, + "grad_norm": 2.9413082599639893, + "learning_rate": 3.529753120609982e-06, + "loss": 0.1963, + "step": 676 + }, + { + "epoch": 3.6594594594594594, + "grad_norm": 3.309837818145752, + "learning_rate": 3.5258831123865136e-06, + "loss": 0.1922, + "step": 677 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 4.124879360198975, + "learning_rate": 3.5220101457823147e-06, + "loss": 0.5589, + "step": 678 + }, + { + "epoch": 3.6702702702702705, + "grad_norm": 3.2587103843688965, + "learning_rate": 3.5181342319660174e-06, + "loss": 0.1757, + "step": 679 + }, + { + "epoch": 3.6756756756756754, + "grad_norm": 4.179666042327881, + "learning_rate": 3.5142553821147498e-06, + "loss": 0.1208, + "step": 680 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 3.4041192531585693, + "learning_rate": 3.5103736074141106e-06, + "loss": 0.2416, + "step": 681 + }, + { + "epoch": 3.6864864864864866, + "grad_norm": 4.982706546783447, + "learning_rate": 3.5064889190581293e-06, + "loss": 0.3841, + "step": 682 + }, + { + "epoch": 3.691891891891892, + "grad_norm": 3.5895309448242188, + "learning_rate": 3.5026013282492406e-06, + "loss": 0.3723, + "step": 683 + }, + { + "epoch": 3.6972972972972973, + "grad_norm": 3.4824306964874268, + "learning_rate": 3.498710846198247e-06, + "loss": 0.4403, + "step": 684 + }, + { + "epoch": 3.7027027027027026, + "grad_norm": 3.501023054122925, + "learning_rate": 3.494817484124289e-06, + "loss": 0.2813, + "step": 685 + }, + { + "epoch": 3.708108108108108, + "grad_norm": 3.934908151626587, + "learning_rate": 3.490921253254813e-06, + "loss": 0.4287, + "step": 686 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 3.24141526222229, + "learning_rate": 3.487022164825539e-06, + "loss": 0.234, + "step": 687 + }, + { + "epoch": 3.718918918918919, + "grad_norm": 3.3419880867004395, + "learning_rate": 3.4831202300804246e-06, + "loss": 0.2135, + "step": 688 + }, + { + "epoch": 3.7243243243243245, + "grad_norm": 3.923778772354126, + "learning_rate": 3.479215460271638e-06, + "loss": 0.2725, + "step": 689 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 3.2432096004486084, + "learning_rate": 3.475307866659522e-06, + "loss": 0.228, + "step": 690 + }, + { + "epoch": 3.735135135135135, + "grad_norm": 3.0307705402374268, + "learning_rate": 3.4713974605125634e-06, + "loss": 0.0985, + "step": 691 + }, + { + "epoch": 3.7405405405405405, + "grad_norm": 2.778942346572876, + "learning_rate": 3.4674842531073587e-06, + "loss": 0.2137, + "step": 692 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 3.711315155029297, + "learning_rate": 3.4635682557285833e-06, + "loss": 0.1707, + "step": 693 + }, + { + "epoch": 3.7513513513513512, + "grad_norm": 3.165668487548828, + "learning_rate": 3.459649479668956e-06, + "loss": 0.3021, + "step": 694 + }, + { + "epoch": 3.756756756756757, + "grad_norm": 3.7491254806518555, + "learning_rate": 3.4557279362292117e-06, + "loss": 0.3457, + "step": 695 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 3.271603584289551, + "learning_rate": 3.451803636718064e-06, + "loss": 0.1193, + "step": 696 + }, + { + "epoch": 3.7675675675675677, + "grad_norm": 3.872382402420044, + "learning_rate": 3.447876592452174e-06, + "loss": 0.2261, + "step": 697 + }, + { + "epoch": 3.772972972972973, + "grad_norm": 4.634008407592773, + "learning_rate": 3.4439468147561196e-06, + "loss": 0.5042, + "step": 698 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 3.6930148601531982, + "learning_rate": 3.440014314962358e-06, + "loss": 0.3481, + "step": 699 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 4.709466457366943, + "learning_rate": 3.4360791044112e-06, + "loss": 0.2317, + "step": 700 + }, + { + "epoch": 3.789189189189189, + "grad_norm": 4.37923002243042, + "learning_rate": 3.432141194450772e-06, + "loss": 0.395, + "step": 701 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 3.1600489616394043, + "learning_rate": 3.4282005964369836e-06, + "loss": 0.1767, + "step": 702 + }, + { + "epoch": 3.8, + "grad_norm": 3.9799487590789795, + "learning_rate": 3.424257321733497e-06, + "loss": 0.2146, + "step": 703 + }, + { + "epoch": 3.8054054054054056, + "grad_norm": 2.79176664352417, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.1534, + "step": 704 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 3.0024254322052, + "learning_rate": 3.4163627877506434e-06, + "loss": 0.2513, + "step": 705 + }, + { + "epoch": 3.8162162162162163, + "grad_norm": 2.924475908279419, + "learning_rate": 3.4124115512370636e-06, + "loss": 0.4154, + "step": 706 + }, + { + "epoch": 3.8216216216216217, + "grad_norm": 3.2713992595672607, + "learning_rate": 3.408457683565295e-06, + "loss": 0.1822, + "step": 707 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 3.094003438949585, + "learning_rate": 3.4045011961372675e-06, + "loss": 0.3589, + "step": 708 + }, + { + "epoch": 3.8324324324324324, + "grad_norm": 3.423858404159546, + "learning_rate": 3.4005421003624637e-06, + "loss": 0.4615, + "step": 709 + }, + { + "epoch": 3.8378378378378377, + "grad_norm": 2.038792848587036, + "learning_rate": 3.3965804076578896e-06, + "loss": 0.1001, + "step": 710 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 2.6447055339813232, + "learning_rate": 3.392616129448039e-06, + "loss": 0.2788, + "step": 711 + }, + { + "epoch": 3.8486486486486484, + "grad_norm": 3.546876907348633, + "learning_rate": 3.3886492771648593e-06, + "loss": 0.2663, + "step": 712 + }, + { + "epoch": 3.854054054054054, + "grad_norm": 2.9587066173553467, + "learning_rate": 3.384679862247726e-06, + "loss": 0.3497, + "step": 713 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 3.7122113704681396, + "learning_rate": 3.3807078961434013e-06, + "loss": 0.3613, + "step": 714 + }, + { + "epoch": 3.864864864864865, + "grad_norm": 3.157294988632202, + "learning_rate": 3.376733390306004e-06, + "loss": 0.0783, + "step": 715 + }, + { + "epoch": 3.8702702702702703, + "grad_norm": 3.564279317855835, + "learning_rate": 3.372756356196979e-06, + "loss": 0.1617, + "step": 716 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 4.231864929199219, + "learning_rate": 3.3687768052850595e-06, + "loss": 0.6444, + "step": 717 + }, + { + "epoch": 3.881081081081081, + "grad_norm": 5.480365753173828, + "learning_rate": 3.364794749046239e-06, + "loss": 0.4858, + "step": 718 + }, + { + "epoch": 3.8864864864864863, + "grad_norm": 3.428140878677368, + "learning_rate": 3.3608101989637333e-06, + "loss": 0.3103, + "step": 719 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 3.521989345550537, + "learning_rate": 3.356823166527952e-06, + "loss": 0.2501, + "step": 720 + }, + { + "epoch": 3.8972972972972975, + "grad_norm": 3.287081718444824, + "learning_rate": 3.352833663236463e-06, + "loss": 0.18, + "step": 721 + }, + { + "epoch": 3.902702702702703, + "grad_norm": 3.323146104812622, + "learning_rate": 3.348841700593956e-06, + "loss": 0.12, + "step": 722 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 3.516693115234375, + "learning_rate": 3.3448472901122187e-06, + "loss": 0.2618, + "step": 723 + }, + { + "epoch": 3.9135135135135135, + "grad_norm": 3.8109545707702637, + "learning_rate": 3.340850443310092e-06, + "loss": 0.3689, + "step": 724 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 3.8335933685302734, + "learning_rate": 3.336851171713447e-06, + "loss": 0.2195, + "step": 725 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 3.9054670333862305, + "learning_rate": 3.3328494868551444e-06, + "loss": 0.2602, + "step": 726 + }, + { + "epoch": 3.92972972972973, + "grad_norm": 3.1380631923675537, + "learning_rate": 3.3288454002750046e-06, + "loss": 0.1561, + "step": 727 + }, + { + "epoch": 3.935135135135135, + "grad_norm": 4.304198741912842, + "learning_rate": 3.3248389235197764e-06, + "loss": 0.4469, + "step": 728 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 3.3321573734283447, + "learning_rate": 3.3208300681430967e-06, + "loss": 0.2246, + "step": 729 + }, + { + "epoch": 3.945945945945946, + "grad_norm": 3.89400315284729, + "learning_rate": 3.3168188457054656e-06, + "loss": 0.2743, + "step": 730 + }, + { + "epoch": 3.9513513513513514, + "grad_norm": 3.393209934234619, + "learning_rate": 3.312805267774209e-06, + "loss": 0.551, + "step": 731 + }, + { + "epoch": 3.9567567567567568, + "grad_norm": 3.711652994155884, + "learning_rate": 3.3087893459234423e-06, + "loss": 0.3522, + "step": 732 + }, + { + "epoch": 3.962162162162162, + "grad_norm": 3.6701200008392334, + "learning_rate": 3.304771091734043e-06, + "loss": 0.3084, + "step": 733 + }, + { + "epoch": 3.9675675675675675, + "grad_norm": 3.1742889881134033, + "learning_rate": 3.300750516793614e-06, + "loss": 0.3406, + "step": 734 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 4.000397682189941, + "learning_rate": 3.2967276326964504e-06, + "loss": 0.3463, + "step": 735 + }, + { + "epoch": 3.9783783783783786, + "grad_norm": 3.7932708263397217, + "learning_rate": 3.2927024510435057e-06, + "loss": 0.3758, + "step": 736 + }, + { + "epoch": 3.983783783783784, + "grad_norm": 3.6258292198181152, + "learning_rate": 3.2886749834423587e-06, + "loss": 0.3328, + "step": 737 + }, + { + "epoch": 3.9891891891891893, + "grad_norm": 4.628194332122803, + "learning_rate": 3.284645241507183e-06, + "loss": 0.6213, + "step": 738 + }, + { + "epoch": 3.9945945945945946, + "grad_norm": 4.173697471618652, + "learning_rate": 3.280613236858707e-06, + "loss": 0.2463, + "step": 739 + }, + { + "epoch": 4.0, + "grad_norm": 2.9315719604492188, + "learning_rate": 3.2765789811241865e-06, + "loss": 0.3501, + "step": 740 + }, + { + "epoch": 4.005405405405406, + "grad_norm": 3.7292938232421875, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1753, + "step": 741 + }, + { + "epoch": 4.010810810810811, + "grad_norm": 3.627298593521118, + "learning_rate": 3.2685037629384587e-06, + "loss": 0.0722, + "step": 742 + }, + { + "epoch": 4.0162162162162165, + "grad_norm": 3.7558975219726562, + "learning_rate": 3.264462823774085e-06, + "loss": 0.2475, + "step": 743 + }, + { + "epoch": 4.021621621621621, + "grad_norm": 2.991217851638794, + "learning_rate": 3.260419680097268e-06, + "loss": 0.1163, + "step": 744 + }, + { + "epoch": 4.027027027027027, + "grad_norm": 3.315901517868042, + "learning_rate": 3.2563743435673855e-06, + "loss": 0.1325, + "step": 745 + }, + { + "epoch": 4.032432432432432, + "grad_norm": 2.9405429363250732, + "learning_rate": 3.252326825850139e-06, + "loss": 0.0466, + "step": 746 + }, + { + "epoch": 4.037837837837838, + "grad_norm": 4.078726291656494, + "learning_rate": 3.2482771386175173e-06, + "loss": 0.1861, + "step": 747 + }, + { + "epoch": 4.043243243243243, + "grad_norm": 3.6752545833587646, + "learning_rate": 3.24422529354777e-06, + "loss": 0.1637, + "step": 748 + }, + { + "epoch": 4.048648648648649, + "grad_norm": 4.471213340759277, + "learning_rate": 3.2401713023253646e-06, + "loss": 0.1379, + "step": 749 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 4.609938144683838, + "learning_rate": 3.2361151766409628e-06, + "loss": 0.1099, + "step": 750 + }, + { + "epoch": 4.059459459459459, + "grad_norm": 3.7480030059814453, + "learning_rate": 3.232056928191376e-06, + "loss": 0.1422, + "step": 751 + }, + { + "epoch": 4.064864864864865, + "grad_norm": 4.23753547668457, + "learning_rate": 3.2279965686795424e-06, + "loss": 0.2716, + "step": 752 + }, + { + "epoch": 4.07027027027027, + "grad_norm": 4.59039306640625, + "learning_rate": 3.2239341098144833e-06, + "loss": 0.3849, + "step": 753 + }, + { + "epoch": 4.075675675675676, + "grad_norm": 2.9332475662231445, + "learning_rate": 3.219869563311277e-06, + "loss": 0.0768, + "step": 754 + }, + { + "epoch": 4.081081081081081, + "grad_norm": 3.8387272357940674, + "learning_rate": 3.2158029408910213e-06, + "loss": 0.112, + "step": 755 + }, + { + "epoch": 4.0864864864864865, + "grad_norm": 2.5676164627075195, + "learning_rate": 3.2117342542807995e-06, + "loss": 0.1054, + "step": 756 + }, + { + "epoch": 4.091891891891892, + "grad_norm": 3.4695913791656494, + "learning_rate": 3.207663515213648e-06, + "loss": 0.1754, + "step": 757 + }, + { + "epoch": 4.097297297297297, + "grad_norm": 3.531060218811035, + "learning_rate": 3.2035907354285234e-06, + "loss": 0.191, + "step": 758 + }, + { + "epoch": 4.102702702702703, + "grad_norm": 3.8944122791290283, + "learning_rate": 3.1995159266702648e-06, + "loss": 0.1083, + "step": 759 + }, + { + "epoch": 4.108108108108108, + "grad_norm": 3.572751998901367, + "learning_rate": 3.1954391006895635e-06, + "loss": 0.0609, + "step": 760 + }, + { + "epoch": 4.113513513513514, + "grad_norm": 3.533867120742798, + "learning_rate": 3.191360269242928e-06, + "loss": 0.049, + "step": 761 + }, + { + "epoch": 4.118918918918919, + "grad_norm": 3.742013454437256, + "learning_rate": 3.18727944409265e-06, + "loss": 0.1642, + "step": 762 + }, + { + "epoch": 4.124324324324324, + "grad_norm": 3.918525457382202, + "learning_rate": 3.1831966370067714e-06, + "loss": 0.1513, + "step": 763 + }, + { + "epoch": 4.12972972972973, + "grad_norm": 4.906899929046631, + "learning_rate": 3.1791118597590467e-06, + "loss": 0.3276, + "step": 764 + }, + { + "epoch": 4.135135135135135, + "grad_norm": 5.704930305480957, + "learning_rate": 3.1750251241289148e-06, + "loss": 0.4011, + "step": 765 + }, + { + "epoch": 4.140540540540541, + "grad_norm": 4.278724193572998, + "learning_rate": 3.1709364419014615e-06, + "loss": 0.2274, + "step": 766 + }, + { + "epoch": 4.145945945945946, + "grad_norm": 3.7831263542175293, + "learning_rate": 3.166845824867384e-06, + "loss": 0.118, + "step": 767 + }, + { + "epoch": 4.151351351351352, + "grad_norm": 3.6355350017547607, + "learning_rate": 3.162753284822962e-06, + "loss": 0.1109, + "step": 768 + }, + { + "epoch": 4.1567567567567565, + "grad_norm": 4.063662052154541, + "learning_rate": 3.1586588335700176e-06, + "loss": 0.1754, + "step": 769 + }, + { + "epoch": 4.162162162162162, + "grad_norm": 3.404348611831665, + "learning_rate": 3.1545624829158873e-06, + "loss": 0.1155, + "step": 770 + }, + { + "epoch": 4.167567567567567, + "grad_norm": 2.7452480792999268, + "learning_rate": 3.1504642446733828e-06, + "loss": 0.0635, + "step": 771 + }, + { + "epoch": 4.172972972972973, + "grad_norm": 2.4755163192749023, + "learning_rate": 3.146364130660761e-06, + "loss": 0.1068, + "step": 772 + }, + { + "epoch": 4.178378378378379, + "grad_norm": 3.0338311195373535, + "learning_rate": 3.142262152701685e-06, + "loss": 0.0637, + "step": 773 + }, + { + "epoch": 4.183783783783784, + "grad_norm": 4.566886901855469, + "learning_rate": 3.138158322625197e-06, + "loss": 0.2703, + "step": 774 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 4.614205360412598, + "learning_rate": 3.1340526522656765e-06, + "loss": 0.2769, + "step": 775 + }, + { + "epoch": 4.194594594594594, + "grad_norm": 3.4197700023651123, + "learning_rate": 3.1299451534628134e-06, + "loss": 0.1192, + "step": 776 + }, + { + "epoch": 4.2, + "grad_norm": 3.2838752269744873, + "learning_rate": 3.1258358380615674e-06, + "loss": 0.1244, + "step": 777 + }, + { + "epoch": 4.205405405405405, + "grad_norm": 4.484423637390137, + "learning_rate": 3.121724717912138e-06, + "loss": 0.2819, + "step": 778 + }, + { + "epoch": 4.210810810810811, + "grad_norm": 2.6898670196533203, + "learning_rate": 3.1176118048699283e-06, + "loss": 0.1018, + "step": 779 + }, + { + "epoch": 4.216216216216216, + "grad_norm": 3.3304710388183594, + "learning_rate": 3.113497110795514e-06, + "loss": 0.1842, + "step": 780 + }, + { + "epoch": 4.221621621621622, + "grad_norm": 3.29425311088562, + "learning_rate": 3.1093806475546046e-06, + "loss": 0.2299, + "step": 781 + }, + { + "epoch": 4.227027027027027, + "grad_norm": 3.0818686485290527, + "learning_rate": 3.1052624270180116e-06, + "loss": 0.1397, + "step": 782 + }, + { + "epoch": 4.232432432432432, + "grad_norm": 4.569559097290039, + "learning_rate": 3.1011424610616153e-06, + "loss": 0.2236, + "step": 783 + }, + { + "epoch": 4.237837837837838, + "grad_norm": 3.2377943992614746, + "learning_rate": 3.097020761566328e-06, + "loss": 0.1417, + "step": 784 + }, + { + "epoch": 4.243243243243243, + "grad_norm": 5.442404270172119, + "learning_rate": 3.092897340418062e-06, + "loss": 0.1317, + "step": 785 + }, + { + "epoch": 4.248648648648649, + "grad_norm": 4.14007568359375, + "learning_rate": 3.088772209507694e-06, + "loss": 0.1869, + "step": 786 + }, + { + "epoch": 4.254054054054054, + "grad_norm": 3.024740695953369, + "learning_rate": 3.0846453807310317e-06, + "loss": 0.0967, + "step": 787 + }, + { + "epoch": 4.2594594594594595, + "grad_norm": 3.463261365890503, + "learning_rate": 3.080516865988778e-06, + "loss": 0.0731, + "step": 788 + }, + { + "epoch": 4.264864864864865, + "grad_norm": 3.398139715194702, + "learning_rate": 3.076386677186498e-06, + "loss": 0.1912, + "step": 789 + }, + { + "epoch": 4.27027027027027, + "grad_norm": 3.934204339981079, + "learning_rate": 3.0722548262345854e-06, + "loss": 0.2133, + "step": 790 + }, + { + "epoch": 4.275675675675676, + "grad_norm": 5.5322041511535645, + "learning_rate": 3.0681213250482255e-06, + "loss": 0.4454, + "step": 791 + }, + { + "epoch": 4.281081081081081, + "grad_norm": 5.381092071533203, + "learning_rate": 3.0639861855473637e-06, + "loss": 0.3645, + "step": 792 + }, + { + "epoch": 4.286486486486487, + "grad_norm": 4.104682445526123, + "learning_rate": 3.05984941965667e-06, + "loss": 0.1331, + "step": 793 + }, + { + "epoch": 4.291891891891892, + "grad_norm": 3.032749652862549, + "learning_rate": 3.055711039305503e-06, + "loss": 0.0863, + "step": 794 + }, + { + "epoch": 4.297297297297297, + "grad_norm": 3.1181957721710205, + "learning_rate": 3.051571056427879e-06, + "loss": 0.1988, + "step": 795 + }, + { + "epoch": 4.302702702702703, + "grad_norm": 4.8824944496154785, + "learning_rate": 3.047429482962433e-06, + "loss": 0.2307, + "step": 796 + }, + { + "epoch": 4.308108108108108, + "grad_norm": 3.5564794540405273, + "learning_rate": 3.0432863308523903e-06, + "loss": 0.1614, + "step": 797 + }, + { + "epoch": 4.313513513513514, + "grad_norm": 2.928267240524292, + "learning_rate": 3.039141612045525e-06, + "loss": 0.0683, + "step": 798 + }, + { + "epoch": 4.318918918918919, + "grad_norm": 2.846242666244507, + "learning_rate": 3.034995338494131e-06, + "loss": 0.1784, + "step": 799 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 2.8273985385894775, + "learning_rate": 3.0308475221549868e-06, + "loss": 0.0451, + "step": 800 + }, + { + "epoch": 4.3297297297297295, + "grad_norm": 3.0229880809783936, + "learning_rate": 3.026698174989316e-06, + "loss": 0.0618, + "step": 801 + }, + { + "epoch": 4.335135135135135, + "grad_norm": 3.555338144302368, + "learning_rate": 3.0225473089627617e-06, + "loss": 0.1529, + "step": 802 + }, + { + "epoch": 4.34054054054054, + "grad_norm": 3.7206318378448486, + "learning_rate": 3.0183949360453442e-06, + "loss": 0.4177, + "step": 803 + }, + { + "epoch": 4.345945945945946, + "grad_norm": 4.038993835449219, + "learning_rate": 3.014241068211428e-06, + "loss": 0.1394, + "step": 804 + }, + { + "epoch": 4.351351351351352, + "grad_norm": 3.723766565322876, + "learning_rate": 3.0100857174396926e-06, + "loss": 0.04, + "step": 805 + }, + { + "epoch": 4.356756756756757, + "grad_norm": 4.745445728302002, + "learning_rate": 3.0059288957130893e-06, + "loss": 0.2705, + "step": 806 + }, + { + "epoch": 4.3621621621621625, + "grad_norm": 3.245249032974243, + "learning_rate": 3.001770615018815e-06, + "loss": 0.2208, + "step": 807 + }, + { + "epoch": 4.367567567567567, + "grad_norm": 4.631863594055176, + "learning_rate": 2.9976108873482725e-06, + "loss": 0.2068, + "step": 808 + }, + { + "epoch": 4.372972972972973, + "grad_norm": 3.4944963455200195, + "learning_rate": 2.9934497246970357e-06, + "loss": 0.1253, + "step": 809 + }, + { + "epoch": 4.378378378378378, + "grad_norm": 3.393252372741699, + "learning_rate": 2.989287139064819e-06, + "loss": 0.1721, + "step": 810 + }, + { + "epoch": 4.383783783783784, + "grad_norm": 3.2354531288146973, + "learning_rate": 2.9851231424554385e-06, + "loss": 0.134, + "step": 811 + }, + { + "epoch": 4.389189189189189, + "grad_norm": 3.8997225761413574, + "learning_rate": 2.9809577468767813e-06, + "loss": 0.0818, + "step": 812 + }, + { + "epoch": 4.394594594594595, + "grad_norm": 3.4745192527770996, + "learning_rate": 2.9767909643407676e-06, + "loss": 0.1797, + "step": 813 + }, + { + "epoch": 4.4, + "grad_norm": 2.8166556358337402, + "learning_rate": 2.9726228068633155e-06, + "loss": 0.145, + "step": 814 + }, + { + "epoch": 4.405405405405405, + "grad_norm": 3.4947283267974854, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.079, + "step": 815 + }, + { + "epoch": 4.410810810810811, + "grad_norm": 3.8058624267578125, + "learning_rate": 2.9642824151675702e-06, + "loss": 0.1763, + "step": 816 + }, + { + "epoch": 4.416216216216216, + "grad_norm": 3.161440134048462, + "learning_rate": 2.9601102050008016e-06, + "loss": 0.2654, + "step": 817 + }, + { + "epoch": 4.421621621621622, + "grad_norm": 2.7620294094085693, + "learning_rate": 2.955936667995578e-06, + "loss": 0.0779, + "step": 818 + }, + { + "epoch": 4.427027027027027, + "grad_norm": 3.2293593883514404, + "learning_rate": 2.9517618161872974e-06, + "loss": 0.0587, + "step": 819 + }, + { + "epoch": 4.4324324324324325, + "grad_norm": 2.753647565841675, + "learning_rate": 2.9475856616151487e-06, + "loss": 0.0835, + "step": 820 + }, + { + "epoch": 4.437837837837838, + "grad_norm": 3.744755744934082, + "learning_rate": 2.9434082163220773e-06, + "loss": 0.1748, + "step": 821 + }, + { + "epoch": 4.443243243243243, + "grad_norm": 3.5458850860595703, + "learning_rate": 2.9392294923547543e-06, + "loss": 0.119, + "step": 822 + }, + { + "epoch": 4.448648648648649, + "grad_norm": 4.037010192871094, + "learning_rate": 2.9350495017635334e-06, + "loss": 0.1535, + "step": 823 + }, + { + "epoch": 4.454054054054054, + "grad_norm": 3.704439401626587, + "learning_rate": 2.9308682566024228e-06, + "loss": 0.2561, + "step": 824 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 2.9537882804870605, + "learning_rate": 2.92668576892905e-06, + "loss": 0.2024, + "step": 825 + }, + { + "epoch": 4.464864864864865, + "grad_norm": 3.1923575401306152, + "learning_rate": 2.9225020508046233e-06, + "loss": 0.0436, + "step": 826 + }, + { + "epoch": 4.47027027027027, + "grad_norm": 3.304884195327759, + "learning_rate": 2.9183171142939002e-06, + "loss": 0.1636, + "step": 827 + }, + { + "epoch": 4.475675675675676, + "grad_norm": 3.5481832027435303, + "learning_rate": 2.9141309714651528e-06, + "loss": 0.0962, + "step": 828 + }, + { + "epoch": 4.481081081081081, + "grad_norm": 4.0650153160095215, + "learning_rate": 2.9099436343901306e-06, + "loss": 0.2129, + "step": 829 + }, + { + "epoch": 4.486486486486487, + "grad_norm": 4.274670124053955, + "learning_rate": 2.9057551151440266e-06, + "loss": 0.2872, + "step": 830 + }, + { + "epoch": 4.491891891891892, + "grad_norm": 4.45655632019043, + "learning_rate": 2.9015654258054433e-06, + "loss": 0.3254, + "step": 831 + }, + { + "epoch": 4.4972972972972975, + "grad_norm": 3.2205746173858643, + "learning_rate": 2.8973745784563596e-06, + "loss": 0.1417, + "step": 832 + }, + { + "epoch": 4.5027027027027025, + "grad_norm": 3.994489908218384, + "learning_rate": 2.8931825851820904e-06, + "loss": 0.2513, + "step": 833 + }, + { + "epoch": 4.508108108108108, + "grad_norm": 2.8250539302825928, + "learning_rate": 2.8889894580712574e-06, + "loss": 0.1785, + "step": 834 + }, + { + "epoch": 4.513513513513513, + "grad_norm": 3.526552200317383, + "learning_rate": 2.884795209215751e-06, + "loss": 0.2853, + "step": 835 + }, + { + "epoch": 4.518918918918919, + "grad_norm": 3.8975565433502197, + "learning_rate": 2.880599850710696e-06, + "loss": 0.2947, + "step": 836 + }, + { + "epoch": 4.524324324324324, + "grad_norm": 2.86104154586792, + "learning_rate": 2.8764033946544197e-06, + "loss": 0.177, + "step": 837 + }, + { + "epoch": 4.52972972972973, + "grad_norm": 3.967454433441162, + "learning_rate": 2.8722058531484105e-06, + "loss": 0.2786, + "step": 838 + }, + { + "epoch": 4.535135135135135, + "grad_norm": 3.9122490882873535, + "learning_rate": 2.86800723829729e-06, + "loss": 0.1881, + "step": 839 + }, + { + "epoch": 4.54054054054054, + "grad_norm": 3.9732089042663574, + "learning_rate": 2.8638075622087747e-06, + "loss": 0.3541, + "step": 840 + }, + { + "epoch": 4.545945945945946, + "grad_norm": 3.7056405544281006, + "learning_rate": 2.8596068369936386e-06, + "loss": 0.3094, + "step": 841 + }, + { + "epoch": 4.551351351351351, + "grad_norm": 3.5056777000427246, + "learning_rate": 2.8554050747656862e-06, + "loss": 0.1162, + "step": 842 + }, + { + "epoch": 4.556756756756757, + "grad_norm": 3.1131439208984375, + "learning_rate": 2.851202287641709e-06, + "loss": 0.1079, + "step": 843 + }, + { + "epoch": 4.562162162162162, + "grad_norm": 3.6517693996429443, + "learning_rate": 2.8469984877414525e-06, + "loss": 0.4462, + "step": 844 + }, + { + "epoch": 4.5675675675675675, + "grad_norm": 3.0627806186676025, + "learning_rate": 2.842793687187588e-06, + "loss": 0.0851, + "step": 845 + }, + { + "epoch": 4.572972972972973, + "grad_norm": 4.0370893478393555, + "learning_rate": 2.8385878981056663e-06, + "loss": 0.1268, + "step": 846 + }, + { + "epoch": 4.578378378378378, + "grad_norm": 3.486156463623047, + "learning_rate": 2.8343811326240944e-06, + "loss": 0.3187, + "step": 847 + }, + { + "epoch": 4.583783783783784, + "grad_norm": 2.4388604164123535, + "learning_rate": 2.830173402874091e-06, + "loss": 0.1315, + "step": 848 + }, + { + "epoch": 4.589189189189189, + "grad_norm": 3.5970475673675537, + "learning_rate": 2.8259647209896573e-06, + "loss": 0.301, + "step": 849 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 3.657775402069092, + "learning_rate": 2.821755099107541e-06, + "loss": 0.1478, + "step": 850 + }, + { + "epoch": 4.6, + "grad_norm": 3.2040653228759766, + "learning_rate": 2.817544549367197e-06, + "loss": 0.2029, + "step": 851 + }, + { + "epoch": 4.605405405405405, + "grad_norm": 2.778747081756592, + "learning_rate": 2.813333083910761e-06, + "loss": 0.0549, + "step": 852 + }, + { + "epoch": 4.610810810810811, + "grad_norm": 3.661921977996826, + "learning_rate": 2.8091207148830046e-06, + "loss": 0.1508, + "step": 853 + }, + { + "epoch": 4.616216216216216, + "grad_norm": 2.7028398513793945, + "learning_rate": 2.8049074544313094e-06, + "loss": 0.1094, + "step": 854 + }, + { + "epoch": 4.621621621621622, + "grad_norm": 3.3319056034088135, + "learning_rate": 2.8006933147056236e-06, + "loss": 0.0799, + "step": 855 + }, + { + "epoch": 4.627027027027027, + "grad_norm": 3.3194944858551025, + "learning_rate": 2.7964783078584336e-06, + "loss": 0.123, + "step": 856 + }, + { + "epoch": 4.632432432432433, + "grad_norm": 2.4618616104125977, + "learning_rate": 2.792262446044725e-06, + "loss": 0.0692, + "step": 857 + }, + { + "epoch": 4.6378378378378375, + "grad_norm": 4.007084846496582, + "learning_rate": 2.788045741421949e-06, + "loss": 0.1596, + "step": 858 + }, + { + "epoch": 4.643243243243243, + "grad_norm": 2.6852214336395264, + "learning_rate": 2.78382820614999e-06, + "loss": 0.047, + "step": 859 + }, + { + "epoch": 4.648648648648649, + "grad_norm": 3.249666690826416, + "learning_rate": 2.779609852391123e-06, + "loss": 0.1561, + "step": 860 + }, + { + "epoch": 4.654054054054054, + "grad_norm": 7.2313337326049805, + "learning_rate": 2.775390692309987e-06, + "loss": 0.2157, + "step": 861 + }, + { + "epoch": 4.65945945945946, + "grad_norm": 3.1866044998168945, + "learning_rate": 2.7711707380735443e-06, + "loss": 0.0782, + "step": 862 + }, + { + "epoch": 4.664864864864865, + "grad_norm": 3.714812755584717, + "learning_rate": 2.766950001851049e-06, + "loss": 0.2994, + "step": 863 + }, + { + "epoch": 4.6702702702702705, + "grad_norm": 3.0355515480041504, + "learning_rate": 2.7627284958140084e-06, + "loss": 0.109, + "step": 864 + }, + { + "epoch": 4.675675675675675, + "grad_norm": 2.8177638053894043, + "learning_rate": 2.7585062321361517e-06, + "loss": 0.2557, + "step": 865 + }, + { + "epoch": 4.681081081081081, + "grad_norm": 3.7162227630615234, + "learning_rate": 2.75428322299339e-06, + "loss": 0.0413, + "step": 866 + }, + { + "epoch": 4.686486486486486, + "grad_norm": 3.008643627166748, + "learning_rate": 2.7500594805637882e-06, + "loss": 0.0402, + "step": 867 + }, + { + "epoch": 4.691891891891892, + "grad_norm": 3.1683881282806396, + "learning_rate": 2.745835017027522e-06, + "loss": 0.1481, + "step": 868 + }, + { + "epoch": 4.697297297297297, + "grad_norm": 3.2899327278137207, + "learning_rate": 2.74160984456685e-06, + "loss": 0.2242, + "step": 869 + }, + { + "epoch": 4.702702702702703, + "grad_norm": 5.386324882507324, + "learning_rate": 2.737383975366071e-06, + "loss": 0.4693, + "step": 870 + }, + { + "epoch": 4.708108108108108, + "grad_norm": 3.0007741451263428, + "learning_rate": 2.7331574216114963e-06, + "loss": 0.1353, + "step": 871 + }, + { + "epoch": 4.713513513513513, + "grad_norm": 2.7533962726593018, + "learning_rate": 2.728930195491411e-06, + "loss": 0.157, + "step": 872 + }, + { + "epoch": 4.718918918918919, + "grad_norm": 3.349351167678833, + "learning_rate": 2.724702309196038e-06, + "loss": 0.1863, + "step": 873 + }, + { + "epoch": 4.724324324324324, + "grad_norm": 3.2562623023986816, + "learning_rate": 2.720473774917505e-06, + "loss": 0.2874, + "step": 874 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 3.4865262508392334, + "learning_rate": 2.716244604849807e-06, + "loss": 0.1021, + "step": 875 + }, + { + "epoch": 4.735135135135135, + "grad_norm": 3.793647289276123, + "learning_rate": 2.7120148111887732e-06, + "loss": 0.1046, + "step": 876 + }, + { + "epoch": 4.7405405405405405, + "grad_norm": 3.8841137886047363, + "learning_rate": 2.707784406132032e-06, + "loss": 0.0971, + "step": 877 + }, + { + "epoch": 4.745945945945946, + "grad_norm": 3.45615816116333, + "learning_rate": 2.703553401878972e-06, + "loss": 0.0507, + "step": 878 + }, + { + "epoch": 4.751351351351351, + "grad_norm": 3.578495502471924, + "learning_rate": 2.6993218106307146e-06, + "loss": 0.0616, + "step": 879 + }, + { + "epoch": 4.756756756756757, + "grad_norm": 4.271491527557373, + "learning_rate": 2.6950896445900685e-06, + "loss": 0.0908, + "step": 880 + }, + { + "epoch": 4.762162162162162, + "grad_norm": 3.889042615890503, + "learning_rate": 2.690856915961504e-06, + "loss": 0.2426, + "step": 881 + }, + { + "epoch": 4.767567567567568, + "grad_norm": 3.8519232273101807, + "learning_rate": 2.686623636951112e-06, + "loss": 0.1881, + "step": 882 + }, + { + "epoch": 4.772972972972973, + "grad_norm": 3.819518804550171, + "learning_rate": 2.6823898197665703e-06, + "loss": 0.1385, + "step": 883 + }, + { + "epoch": 4.778378378378378, + "grad_norm": 4.091328144073486, + "learning_rate": 2.6781554766171104e-06, + "loss": 0.2913, + "step": 884 + }, + { + "epoch": 4.783783783783784, + "grad_norm": 2.60793399810791, + "learning_rate": 2.673920619713478e-06, + "loss": 0.0874, + "step": 885 + }, + { + "epoch": 4.789189189189189, + "grad_norm": 4.59322452545166, + "learning_rate": 2.6696852612679024e-06, + "loss": 0.2703, + "step": 886 + }, + { + "epoch": 4.794594594594595, + "grad_norm": 3.4631619453430176, + "learning_rate": 2.6654494134940586e-06, + "loss": 0.121, + "step": 887 + }, + { + "epoch": 4.8, + "grad_norm": 3.8556058406829834, + "learning_rate": 2.6612130886070313e-06, + "loss": 0.1853, + "step": 888 + }, + { + "epoch": 4.805405405405406, + "grad_norm": 2.932152271270752, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.0533, + "step": 889 + }, + { + "epoch": 4.8108108108108105, + "grad_norm": 4.647441387176514, + "learning_rate": 2.652739056360618e-06, + "loss": 0.3178, + "step": 890 + }, + { + "epoch": 4.816216216216216, + "grad_norm": 4.682106018066406, + "learning_rate": 2.648501373438142e-06, + "loss": 0.1735, + "step": 891 + }, + { + "epoch": 4.821621621621622, + "grad_norm": 3.1454825401306152, + "learning_rate": 2.644263262276234e-06, + "loss": 0.062, + "step": 892 + }, + { + "epoch": 4.827027027027027, + "grad_norm": 3.579653739929199, + "learning_rate": 2.640024735096507e-06, + "loss": 0.1336, + "step": 893 + }, + { + "epoch": 4.832432432432433, + "grad_norm": 2.558265447616577, + "learning_rate": 2.6357858041217733e-06, + "loss": 0.1404, + "step": 894 + }, + { + "epoch": 4.837837837837838, + "grad_norm": 2.3879470825195312, + "learning_rate": 2.6315464815760104e-06, + "loss": 0.0373, + "step": 895 + }, + { + "epoch": 4.8432432432432435, + "grad_norm": 4.418992042541504, + "learning_rate": 2.6273067796843242e-06, + "loss": 0.3068, + "step": 896 + }, + { + "epoch": 4.848648648648648, + "grad_norm": 3.08585786819458, + "learning_rate": 2.6230667106729157e-06, + "loss": 0.2221, + "step": 897 + }, + { + "epoch": 4.854054054054054, + "grad_norm": 2.9488885402679443, + "learning_rate": 2.618826286769043e-06, + "loss": 0.1431, + "step": 898 + }, + { + "epoch": 4.859459459459459, + "grad_norm": 4.123927116394043, + "learning_rate": 2.614585520200989e-06, + "loss": 0.196, + "step": 899 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 4.289125919342041, + "learning_rate": 2.6103444231980233e-06, + "loss": 0.2509, + "step": 900 + }, + { + "epoch": 4.87027027027027, + "grad_norm": 3.0358095169067383, + "learning_rate": 2.606103007990371e-06, + "loss": 0.0747, + "step": 901 + }, + { + "epoch": 4.875675675675676, + "grad_norm": 3.6471376419067383, + "learning_rate": 2.601861286809172e-06, + "loss": 0.0494, + "step": 902 + }, + { + "epoch": 4.881081081081081, + "grad_norm": 3.424712896347046, + "learning_rate": 2.5976192718864497e-06, + "loss": 0.0901, + "step": 903 + }, + { + "epoch": 4.886486486486486, + "grad_norm": 4.047586441040039, + "learning_rate": 2.593376975455075e-06, + "loss": 0.0465, + "step": 904 + }, + { + "epoch": 4.891891891891892, + "grad_norm": 4.448032379150391, + "learning_rate": 2.5891344097487294e-06, + "loss": 0.0616, + "step": 905 + }, + { + "epoch": 4.897297297297297, + "grad_norm": 3.3522684574127197, + "learning_rate": 2.584891587001872e-06, + "loss": 0.087, + "step": 906 + }, + { + "epoch": 4.902702702702703, + "grad_norm": 2.979238986968994, + "learning_rate": 2.580648519449704e-06, + "loss": 0.053, + "step": 907 + }, + { + "epoch": 4.908108108108108, + "grad_norm": 6.049450397491455, + "learning_rate": 2.5764052193281287e-06, + "loss": 0.2707, + "step": 908 + }, + { + "epoch": 4.9135135135135135, + "grad_norm": 6.647163391113281, + "learning_rate": 2.5721616988737254e-06, + "loss": 0.3679, + "step": 909 + }, + { + "epoch": 4.918918918918919, + "grad_norm": 3.764979839324951, + "learning_rate": 2.567917970323704e-06, + "loss": 0.1929, + "step": 910 + }, + { + "epoch": 4.924324324324324, + "grad_norm": 3.5592362880706787, + "learning_rate": 2.5636740459158776e-06, + "loss": 0.2461, + "step": 911 + }, + { + "epoch": 4.92972972972973, + "grad_norm": 4.4554762840271, + "learning_rate": 2.559429937888624e-06, + "loss": 0.2484, + "step": 912 + }, + { + "epoch": 4.935135135135135, + "grad_norm": 3.358375072479248, + "learning_rate": 2.5551856584808483e-06, + "loss": 0.1886, + "step": 913 + }, + { + "epoch": 4.940540540540541, + "grad_norm": 3.5831756591796875, + "learning_rate": 2.5509412199319515e-06, + "loss": 0.1789, + "step": 914 + }, + { + "epoch": 4.945945945945946, + "grad_norm": 2.4555728435516357, + "learning_rate": 2.5466966344817927e-06, + "loss": 0.1072, + "step": 915 + }, + { + "epoch": 4.951351351351351, + "grad_norm": 4.581109046936035, + "learning_rate": 2.542451914370656e-06, + "loss": 0.2624, + "step": 916 + }, + { + "epoch": 4.956756756756757, + "grad_norm": 2.9763975143432617, + "learning_rate": 2.538207071839213e-06, + "loss": 0.0639, + "step": 917 + }, + { + "epoch": 4.962162162162162, + "grad_norm": 3.516282796859741, + "learning_rate": 2.533962119128487e-06, + "loss": 0.1281, + "step": 918 + }, + { + "epoch": 4.967567567567568, + "grad_norm": 3.0369791984558105, + "learning_rate": 2.529717068479821e-06, + "loss": 0.1771, + "step": 919 + }, + { + "epoch": 4.972972972972973, + "grad_norm": 2.998521327972412, + "learning_rate": 2.5254719321348392e-06, + "loss": 0.2582, + "step": 920 + }, + { + "epoch": 4.978378378378379, + "grad_norm": 3.002901792526245, + "learning_rate": 2.5212267223354143e-06, + "loss": 0.3016, + "step": 921 + }, + { + "epoch": 4.9837837837837835, + "grad_norm": 3.564932346343994, + "learning_rate": 2.5169814513236296e-06, + "loss": 0.2775, + "step": 922 + }, + { + "epoch": 4.989189189189189, + "grad_norm": 3.726227283477783, + "learning_rate": 2.5127361313417447e-06, + "loss": 0.1246, + "step": 923 + }, + { + "epoch": 4.994594594594595, + "grad_norm": 4.766391754150391, + "learning_rate": 2.508490774632162e-06, + "loss": 0.1732, + "step": 924 + }, + { + "epoch": 5.0, + "grad_norm": 2.9859752655029297, + "learning_rate": 2.5042453934373874e-06, + "loss": 0.1107, + "step": 925 + } + ], + "logging_steps": 1, + "max_steps": 1850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.495984431173468e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl b/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c6f520e97e534cfb4519495aecb59859b577893 --- /dev/null +++ b/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl @@ -0,0 +1,1290 @@ +{"current_steps": 1, "total_steps": 1850, "loss": 2.9165, "lr": 5e-06, "epoch": 0.005405405405405406, "percentage": 0.05, "elapsed_time": "0:00:03", "remaining_time": "1:56:42"} +{"current_steps": 2, "total_steps": 1850, "loss": 1.9314, "lr": 4.999996395324314e-06, "epoch": 0.010810810810810811, "percentage": 0.11, "elapsed_time": "0:00:07", "remaining_time": "1:49:49"} +{"current_steps": 3, "total_steps": 1850, "loss": 1.5709, "lr": 4.99998558130765e-06, "epoch": 0.016216216216216217, "percentage": 0.16, "elapsed_time": "0:00:13", "remaining_time": "2:15:03"} +{"current_steps": 4, "total_steps": 1850, "loss": 0.8099, "lr": 4.999967557981192e-06, "epoch": 0.021621621621621623, "percentage": 0.22, "elapsed_time": "0:00:15", "remaining_time": "2:02:01"} +{"current_steps": 5, "total_steps": 1850, "loss": 0.9021, "lr": 4.999942325396917e-06, "epoch": 0.02702702702702703, "percentage": 0.27, "elapsed_time": "0:00:18", "remaining_time": "1:54:34"} +{"current_steps": 6, "total_steps": 1850, "loss": 1.7972, "lr": 4.999909883627588e-06, "epoch": 0.032432432432432434, "percentage": 0.32, "elapsed_time": "0:00:23", "remaining_time": "2:00:53"} +{"current_steps": 7, "total_steps": 1850, "loss": 1.4306, "lr": 4.999870232766757e-06, "epoch": 0.03783783783783784, "percentage": 0.38, "elapsed_time": "0:00:26", "remaining_time": "1:55:27"} +{"current_steps": 8, "total_steps": 1850, "loss": 1.051, "lr": 4.9998233729287696e-06, "epoch": 0.043243243243243246, "percentage": 0.43, "elapsed_time": "0:00:29", "remaining_time": "1:53:49"} +{"current_steps": 9, "total_steps": 1850, "loss": 0.8089, "lr": 4.999769304248755e-06, "epoch": 0.04864864864864865, "percentage": 0.49, "elapsed_time": "0:00:31", "remaining_time": "1:49:02"} +{"current_steps": 10, "total_steps": 1850, "loss": 1.0999, "lr": 4.9997080268826344e-06, "epoch": 0.05405405405405406, "percentage": 0.54, "elapsed_time": "0:00:35", "remaining_time": "1:50:03"} +{"current_steps": 11, "total_steps": 1850, "loss": 1.2831, "lr": 4.9996395410071165e-06, "epoch": 0.05945945945945946, "percentage": 0.59, "elapsed_time": "0:00:39", "remaining_time": "1:49:01"} +{"current_steps": 12, "total_steps": 1850, "loss": 1.2874, "lr": 4.999563846819696e-06, "epoch": 0.06486486486486487, "percentage": 0.65, "elapsed_time": "0:00:44", "remaining_time": "1:54:29"} +{"current_steps": 13, "total_steps": 1850, "loss": 0.96, "lr": 4.999480944538655e-06, "epoch": 0.07027027027027027, "percentage": 0.7, "elapsed_time": "0:00:46", "remaining_time": "1:48:43"} +{"current_steps": 14, "total_steps": 1850, "loss": 0.9869, "lr": 4.999390834403063e-06, "epoch": 0.07567567567567568, "percentage": 0.76, "elapsed_time": "0:00:50", "remaining_time": "1:49:43"} +{"current_steps": 15, "total_steps": 1850, "loss": 0.9293, "lr": 4.999293516672773e-06, "epoch": 0.08108108108108109, "percentage": 0.81, "elapsed_time": "0:00:51", "remaining_time": "1:45:18"} +{"current_steps": 16, "total_steps": 1850, "loss": 0.8914, "lr": 4.9991889916284255e-06, "epoch": 0.08648648648648649, "percentage": 0.86, "elapsed_time": "0:00:52", "remaining_time": "1:40:36"} +{"current_steps": 17, "total_steps": 1850, "loss": 1.0176, "lr": 4.999077259571442e-06, "epoch": 0.0918918918918919, "percentage": 0.92, "elapsed_time": "0:00:53", "remaining_time": "1:36:26"} +{"current_steps": 18, "total_steps": 1850, "loss": 1.0259, "lr": 4.998958320824031e-06, "epoch": 0.0972972972972973, "percentage": 0.97, "elapsed_time": "0:00:58", "remaining_time": "1:39:01"} +{"current_steps": 19, "total_steps": 1850, "loss": 1.3356, "lr": 4.998832175729179e-06, "epoch": 0.10270270270270271, "percentage": 1.03, "elapsed_time": "0:01:01", "remaining_time": "1:39:12"} +{"current_steps": 20, "total_steps": 1850, "loss": 1.4486, "lr": 4.998698824650656e-06, "epoch": 0.10810810810810811, "percentage": 1.08, "elapsed_time": "0:01:04", "remaining_time": "1:39:07"} +{"current_steps": 21, "total_steps": 1850, "loss": 0.8372, "lr": 4.998558267973014e-06, "epoch": 0.11351351351351352, "percentage": 1.14, "elapsed_time": "0:01:08", "remaining_time": "1:38:47"} +{"current_steps": 22, "total_steps": 1850, "loss": 0.7931, "lr": 4.998410506101579e-06, "epoch": 0.11891891891891893, "percentage": 1.19, "elapsed_time": "0:01:09", "remaining_time": "1:36:45"} +{"current_steps": 23, "total_steps": 1850, "loss": 1.3022, "lr": 4.9982555394624595e-06, "epoch": 0.12432432432432433, "percentage": 1.24, "elapsed_time": "0:01:15", "remaining_time": "1:40:32"} +{"current_steps": 24, "total_steps": 1850, "loss": 0.9739, "lr": 4.998093368502539e-06, "epoch": 0.12972972972972974, "percentage": 1.3, "elapsed_time": "0:01:17", "remaining_time": "1:38:20"} +{"current_steps": 25, "total_steps": 1850, "loss": 1.1154, "lr": 4.9979239936894765e-06, "epoch": 0.13513513513513514, "percentage": 1.35, "elapsed_time": "0:01:24", "remaining_time": "1:42:18"} +{"current_steps": 26, "total_steps": 1850, "loss": 0.7543, "lr": 4.997747415511705e-06, "epoch": 0.14054054054054055, "percentage": 1.41, "elapsed_time": "0:01:27", "remaining_time": "1:42:48"} +{"current_steps": 27, "total_steps": 1850, "loss": 0.7278, "lr": 4.997563634478428e-06, "epoch": 0.14594594594594595, "percentage": 1.46, "elapsed_time": "0:01:29", "remaining_time": "1:41:05"} +{"current_steps": 28, "total_steps": 1850, "loss": 0.8167, "lr": 4.997372651119626e-06, "epoch": 0.15135135135135136, "percentage": 1.51, "elapsed_time": "0:01:33", "remaining_time": "1:41:49"} +{"current_steps": 29, "total_steps": 1850, "loss": 0.8031, "lr": 4.997174465986044e-06, "epoch": 0.15675675675675677, "percentage": 1.57, "elapsed_time": "0:01:37", "remaining_time": "1:41:53"} +{"current_steps": 30, "total_steps": 1850, "loss": 0.689, "lr": 4.996969079649196e-06, "epoch": 0.16216216216216217, "percentage": 1.62, "elapsed_time": "0:01:41", "remaining_time": "1:43:06"} +{"current_steps": 31, "total_steps": 1850, "loss": 0.8059, "lr": 4.996756492701362e-06, "epoch": 0.16756756756756758, "percentage": 1.68, "elapsed_time": "0:01:43", "remaining_time": "1:41:03"} +{"current_steps": 32, "total_steps": 1850, "loss": 0.9658, "lr": 4.996536705755591e-06, "epoch": 0.17297297297297298, "percentage": 1.73, "elapsed_time": "0:01:48", "remaining_time": "1:42:46"} +{"current_steps": 33, "total_steps": 1850, "loss": 0.8349, "lr": 4.996309719445687e-06, "epoch": 0.1783783783783784, "percentage": 1.78, "elapsed_time": "0:01:50", "remaining_time": "1:40:59"} +{"current_steps": 34, "total_steps": 1850, "loss": 0.8287, "lr": 4.996075534426223e-06, "epoch": 0.1837837837837838, "percentage": 1.84, "elapsed_time": "0:01:54", "remaining_time": "1:41:33"} +{"current_steps": 35, "total_steps": 1850, "loss": 1.1211, "lr": 4.995834151372526e-06, "epoch": 0.1891891891891892, "percentage": 1.89, "elapsed_time": "0:01:57", "remaining_time": "1:41:55"} +{"current_steps": 36, "total_steps": 1850, "loss": 1.0841, "lr": 4.995585570980685e-06, "epoch": 0.1945945945945946, "percentage": 1.95, "elapsed_time": "0:02:00", "remaining_time": "1:40:51"} +{"current_steps": 37, "total_steps": 1850, "loss": 0.6182, "lr": 4.995329793967537e-06, "epoch": 0.2, "percentage": 2.0, "elapsed_time": "0:02:01", "remaining_time": "1:38:49"} +{"current_steps": 38, "total_steps": 1850, "loss": 0.7647, "lr": 4.9950668210706795e-06, "epoch": 0.20540540540540542, "percentage": 2.05, "elapsed_time": "0:02:02", "remaining_time": "1:37:12"} +{"current_steps": 39, "total_steps": 1850, "loss": 0.8691, "lr": 4.994796653048457e-06, "epoch": 0.21081081081081082, "percentage": 2.11, "elapsed_time": "0:02:06", "remaining_time": "1:38:09"} +{"current_steps": 40, "total_steps": 1850, "loss": 1.0404, "lr": 4.994519290679965e-06, "epoch": 0.21621621621621623, "percentage": 2.16, "elapsed_time": "0:02:10", "remaining_time": "1:38:08"} +{"current_steps": 41, "total_steps": 1850, "loss": 1.1877, "lr": 4.994234734765043e-06, "epoch": 0.22162162162162163, "percentage": 2.22, "elapsed_time": "0:02:15", "remaining_time": "1:39:50"} +{"current_steps": 42, "total_steps": 1850, "loss": 0.959, "lr": 4.993942986124278e-06, "epoch": 0.22702702702702704, "percentage": 2.27, "elapsed_time": "0:02:19", "remaining_time": "1:40:25"} +{"current_steps": 43, "total_steps": 1850, "loss": 0.9249, "lr": 4.9936440455989975e-06, "epoch": 0.23243243243243245, "percentage": 2.32, "elapsed_time": "0:02:22", "remaining_time": "1:40:08"} +{"current_steps": 44, "total_steps": 1850, "loss": 0.6899, "lr": 4.993337914051266e-06, "epoch": 0.23783783783783785, "percentage": 2.38, "elapsed_time": "0:02:25", "remaining_time": "1:39:23"} +{"current_steps": 45, "total_steps": 1850, "loss": 0.9075, "lr": 4.99302459236389e-06, "epoch": 0.24324324324324326, "percentage": 2.43, "elapsed_time": "0:02:31", "remaining_time": "1:41:02"} +{"current_steps": 46, "total_steps": 1850, "loss": 0.785, "lr": 4.992704081440407e-06, "epoch": 0.24864864864864866, "percentage": 2.49, "elapsed_time": "0:02:32", "remaining_time": "1:39:44"} +{"current_steps": 47, "total_steps": 1850, "loss": 1.008, "lr": 4.992376382205088e-06, "epoch": 0.25405405405405407, "percentage": 2.54, "elapsed_time": "0:02:35", "remaining_time": "1:39:41"} +{"current_steps": 48, "total_steps": 1850, "loss": 0.7751, "lr": 4.992041495602932e-06, "epoch": 0.2594594594594595, "percentage": 2.59, "elapsed_time": "0:02:38", "remaining_time": "1:39:07"} +{"current_steps": 49, "total_steps": 1850, "loss": 0.9022, "lr": 4.991699422599664e-06, "epoch": 0.2648648648648649, "percentage": 2.65, "elapsed_time": "0:02:40", "remaining_time": "1:38:33"} +{"current_steps": 50, "total_steps": 1850, "loss": 0.8801, "lr": 4.991350164181735e-06, "epoch": 0.2702702702702703, "percentage": 2.7, "elapsed_time": "0:02:44", "remaining_time": "1:38:37"} +{"current_steps": 51, "total_steps": 1850, "loss": 0.7045, "lr": 4.990993721356317e-06, "epoch": 0.2756756756756757, "percentage": 2.76, "elapsed_time": "0:02:46", "remaining_time": "1:37:42"} +{"current_steps": 52, "total_steps": 1850, "loss": 0.7312, "lr": 4.990630095151296e-06, "epoch": 0.2810810810810811, "percentage": 2.81, "elapsed_time": "0:02:48", "remaining_time": "1:37:06"} +{"current_steps": 53, "total_steps": 1850, "loss": 0.9609, "lr": 4.9902592866152765e-06, "epoch": 0.2864864864864865, "percentage": 2.86, "elapsed_time": "0:02:51", "remaining_time": "1:36:53"} +{"current_steps": 54, "total_steps": 1850, "loss": 0.5753, "lr": 4.989881296817575e-06, "epoch": 0.2918918918918919, "percentage": 2.92, "elapsed_time": "0:02:53", "remaining_time": "1:36:16"} +{"current_steps": 55, "total_steps": 1850, "loss": 0.5118, "lr": 4.989496126848215e-06, "epoch": 0.2972972972972973, "percentage": 2.97, "elapsed_time": "0:02:55", "remaining_time": "1:35:36"} +{"current_steps": 56, "total_steps": 1850, "loss": 1.1261, "lr": 4.989103777817928e-06, "epoch": 0.3027027027027027, "percentage": 3.03, "elapsed_time": "0:03:02", "remaining_time": "1:37:15"} +{"current_steps": 57, "total_steps": 1850, "loss": 0.7823, "lr": 4.988704250858145e-06, "epoch": 0.3081081081081081, "percentage": 3.08, "elapsed_time": "0:03:04", "remaining_time": "1:36:57"} +{"current_steps": 58, "total_steps": 1850, "loss": 0.6019, "lr": 4.988297547121e-06, "epoch": 0.31351351351351353, "percentage": 3.14, "elapsed_time": "0:03:09", "remaining_time": "1:37:39"} +{"current_steps": 59, "total_steps": 1850, "loss": 0.825, "lr": 4.98788366777932e-06, "epoch": 0.31891891891891894, "percentage": 3.19, "elapsed_time": "0:03:11", "remaining_time": "1:36:47"} +{"current_steps": 60, "total_steps": 1850, "loss": 0.7667, "lr": 4.987462614026625e-06, "epoch": 0.32432432432432434, "percentage": 3.24, "elapsed_time": "0:03:13", "remaining_time": "1:36:13"} +{"current_steps": 61, "total_steps": 1850, "loss": 0.8051, "lr": 4.987034387077126e-06, "epoch": 0.32972972972972975, "percentage": 3.3, "elapsed_time": "0:03:16", "remaining_time": "1:36:16"} +{"current_steps": 62, "total_steps": 1850, "loss": 0.6895, "lr": 4.986598988165718e-06, "epoch": 0.33513513513513515, "percentage": 3.35, "elapsed_time": "0:03:19", "remaining_time": "1:36:07"} +{"current_steps": 63, "total_steps": 1850, "loss": 0.9268, "lr": 4.9861564185479785e-06, "epoch": 0.34054054054054056, "percentage": 3.41, "elapsed_time": "0:03:25", "remaining_time": "1:36:57"} +{"current_steps": 64, "total_steps": 1850, "loss": 0.9854, "lr": 4.985706679500163e-06, "epoch": 0.34594594594594597, "percentage": 3.46, "elapsed_time": "0:03:28", "remaining_time": "1:36:45"} +{"current_steps": 65, "total_steps": 1850, "loss": 0.8083, "lr": 4.9852497723192025e-06, "epoch": 0.35135135135135137, "percentage": 3.51, "elapsed_time": "0:03:29", "remaining_time": "1:35:40"} +{"current_steps": 66, "total_steps": 1850, "loss": 0.9098, "lr": 4.9847856983227e-06, "epoch": 0.3567567567567568, "percentage": 3.57, "elapsed_time": "0:03:31", "remaining_time": "1:35:11"} +{"current_steps": 67, "total_steps": 1850, "loss": 0.8881, "lr": 4.984314458848923e-06, "epoch": 0.3621621621621622, "percentage": 3.62, "elapsed_time": "0:03:34", "remaining_time": "1:35:02"} +{"current_steps": 68, "total_steps": 1850, "loss": 0.9877, "lr": 4.983836055256804e-06, "epoch": 0.3675675675675676, "percentage": 3.68, "elapsed_time": "0:03:38", "remaining_time": "1:35:16"} +{"current_steps": 69, "total_steps": 1850, "loss": 0.8282, "lr": 4.983350488925935e-06, "epoch": 0.372972972972973, "percentage": 3.73, "elapsed_time": "0:03:40", "remaining_time": "1:34:50"} +{"current_steps": 70, "total_steps": 1850, "loss": 1.1756, "lr": 4.982857761256564e-06, "epoch": 0.3783783783783784, "percentage": 3.78, "elapsed_time": "0:03:44", "remaining_time": "1:35:15"} +{"current_steps": 71, "total_steps": 1850, "loss": 0.8114, "lr": 4.982357873669589e-06, "epoch": 0.3837837837837838, "percentage": 3.84, "elapsed_time": "0:03:46", "remaining_time": "1:34:40"} +{"current_steps": 72, "total_steps": 1850, "loss": 0.6763, "lr": 4.981850827606556e-06, "epoch": 0.3891891891891892, "percentage": 3.89, "elapsed_time": "0:03:48", "remaining_time": "1:34:12"} +{"current_steps": 73, "total_steps": 1850, "loss": 0.9372, "lr": 4.981336624529655e-06, "epoch": 0.3945945945945946, "percentage": 3.95, "elapsed_time": "0:03:50", "remaining_time": "1:33:40"} +{"current_steps": 74, "total_steps": 1850, "loss": 1.0155, "lr": 4.980815265921714e-06, "epoch": 0.4, "percentage": 4.0, "elapsed_time": "0:03:53", "remaining_time": "1:33:34"} +{"current_steps": 75, "total_steps": 1850, "loss": 0.949, "lr": 4.980286753286196e-06, "epoch": 0.40540540540540543, "percentage": 4.05, "elapsed_time": "0:03:59", "remaining_time": "1:34:33"} +{"current_steps": 76, "total_steps": 1850, "loss": 1.0134, "lr": 4.979751088147192e-06, "epoch": 0.41081081081081083, "percentage": 4.11, "elapsed_time": "0:04:02", "remaining_time": "1:34:21"} +{"current_steps": 77, "total_steps": 1850, "loss": 0.9722, "lr": 4.979208272049425e-06, "epoch": 0.41621621621621624, "percentage": 4.16, "elapsed_time": "0:04:04", "remaining_time": "1:33:55"} +{"current_steps": 78, "total_steps": 1850, "loss": 1.2259, "lr": 4.978658306558235e-06, "epoch": 0.42162162162162165, "percentage": 4.22, "elapsed_time": "0:04:08", "remaining_time": "1:34:03"} +{"current_steps": 79, "total_steps": 1850, "loss": 0.834, "lr": 4.978101193259578e-06, "epoch": 0.42702702702702705, "percentage": 4.27, "elapsed_time": "0:04:09", "remaining_time": "1:33:12"} +{"current_steps": 80, "total_steps": 1850, "loss": 0.6151, "lr": 4.977536933760025e-06, "epoch": 0.43243243243243246, "percentage": 4.32, "elapsed_time": "0:04:11", "remaining_time": "1:32:49"} +{"current_steps": 81, "total_steps": 1850, "loss": 1.0475, "lr": 4.976965529686755e-06, "epoch": 0.43783783783783786, "percentage": 4.38, "elapsed_time": "0:04:15", "remaining_time": "1:32:49"} +{"current_steps": 82, "total_steps": 1850, "loss": 0.8324, "lr": 4.976386982687548e-06, "epoch": 0.44324324324324327, "percentage": 4.43, "elapsed_time": "0:04:16", "remaining_time": "1:32:20"} +{"current_steps": 83, "total_steps": 1850, "loss": 0.997, "lr": 4.9758012944307845e-06, "epoch": 0.4486486486486487, "percentage": 4.49, "elapsed_time": "0:04:22", "remaining_time": "1:33:03"} +{"current_steps": 84, "total_steps": 1850, "loss": 1.2024, "lr": 4.975208466605436e-06, "epoch": 0.4540540540540541, "percentage": 4.54, "elapsed_time": "0:04:24", "remaining_time": "1:32:49"} +{"current_steps": 85, "total_steps": 1850, "loss": 0.9146, "lr": 4.974608500921064e-06, "epoch": 0.4594594594594595, "percentage": 4.59, "elapsed_time": "0:04:27", "remaining_time": "1:32:27"} +{"current_steps": 86, "total_steps": 1850, "loss": 0.7181, "lr": 4.974001399107816e-06, "epoch": 0.4648648648648649, "percentage": 4.65, "elapsed_time": "0:04:29", "remaining_time": "1:31:58"} +{"current_steps": 87, "total_steps": 1850, "loss": 0.8599, "lr": 4.973387162916415e-06, "epoch": 0.4702702702702703, "percentage": 4.7, "elapsed_time": "0:04:33", "remaining_time": "1:32:19"} +{"current_steps": 88, "total_steps": 1850, "loss": 0.6081, "lr": 4.972765794118158e-06, "epoch": 0.4756756756756757, "percentage": 4.76, "elapsed_time": "0:04:34", "remaining_time": "1:31:29"} +{"current_steps": 89, "total_steps": 1850, "loss": 0.8764, "lr": 4.9721372945049114e-06, "epoch": 0.4810810810810811, "percentage": 4.81, "elapsed_time": "0:04:37", "remaining_time": "1:31:35"} +{"current_steps": 90, "total_steps": 1850, "loss": 0.8622, "lr": 4.971501665889107e-06, "epoch": 0.4864864864864865, "percentage": 4.86, "elapsed_time": "0:04:45", "remaining_time": "1:32:53"} +{"current_steps": 91, "total_steps": 1850, "loss": 0.5523, "lr": 4.9708589101037306e-06, "epoch": 0.4918918918918919, "percentage": 4.92, "elapsed_time": "0:04:48", "remaining_time": "1:32:52"} +{"current_steps": 92, "total_steps": 1850, "loss": 0.8922, "lr": 4.970209029002325e-06, "epoch": 0.4972972972972973, "percentage": 4.97, "elapsed_time": "0:04:55", "remaining_time": "1:34:06"} +{"current_steps": 93, "total_steps": 1850, "loss": 0.9455, "lr": 4.969552024458977e-06, "epoch": 0.5027027027027027, "percentage": 5.03, "elapsed_time": "0:04:59", "remaining_time": "1:34:25"} +{"current_steps": 94, "total_steps": 1850, "loss": 0.8342, "lr": 4.968887898368318e-06, "epoch": 0.5081081081081081, "percentage": 5.08, "elapsed_time": "0:05:05", "remaining_time": "1:35:14"} +{"current_steps": 95, "total_steps": 1850, "loss": 0.8476, "lr": 4.968216652645515e-06, "epoch": 0.5135135135135135, "percentage": 5.14, "elapsed_time": "0:05:11", "remaining_time": "1:35:56"} +{"current_steps": 96, "total_steps": 1850, "loss": 0.8879, "lr": 4.967538289226268e-06, "epoch": 0.518918918918919, "percentage": 5.19, "elapsed_time": "0:05:13", "remaining_time": "1:35:31"} +{"current_steps": 97, "total_steps": 1850, "loss": 0.7114, "lr": 4.966852810066798e-06, "epoch": 0.5243243243243243, "percentage": 5.24, "elapsed_time": "0:05:16", "remaining_time": "1:35:28"} +{"current_steps": 98, "total_steps": 1850, "loss": 0.6757, "lr": 4.9661602171438524e-06, "epoch": 0.5297297297297298, "percentage": 5.3, "elapsed_time": "0:05:18", "remaining_time": "1:34:47"} +{"current_steps": 99, "total_steps": 1850, "loss": 0.8029, "lr": 4.965460512454687e-06, "epoch": 0.5351351351351351, "percentage": 5.35, "elapsed_time": "0:05:20", "remaining_time": "1:34:27"} +{"current_steps": 100, "total_steps": 1850, "loss": 0.842, "lr": 4.964753698017071e-06, "epoch": 0.5405405405405406, "percentage": 5.41, "elapsed_time": "0:05:22", "remaining_time": "1:34:08"} +{"current_steps": 101, "total_steps": 1850, "loss": 0.6339, "lr": 4.964039775869271e-06, "epoch": 0.5459459459459459, "percentage": 5.46, "elapsed_time": "0:05:24", "remaining_time": "1:33:32"} +{"current_steps": 102, "total_steps": 1850, "loss": 0.7743, "lr": 4.963318748070056e-06, "epoch": 0.5513513513513514, "percentage": 5.51, "elapsed_time": "0:05:26", "remaining_time": "1:33:22"} +{"current_steps": 103, "total_steps": 1850, "loss": 0.926, "lr": 4.9625906166986815e-06, "epoch": 0.5567567567567567, "percentage": 5.57, "elapsed_time": "0:05:33", "remaining_time": "1:34:16"} +{"current_steps": 104, "total_steps": 1850, "loss": 0.7037, "lr": 4.961855383854889e-06, "epoch": 0.5621621621621622, "percentage": 5.62, "elapsed_time": "0:05:36", "remaining_time": "1:34:08"} +{"current_steps": 105, "total_steps": 1850, "loss": 0.561, "lr": 4.961113051658901e-06, "epoch": 0.5675675675675675, "percentage": 5.68, "elapsed_time": "0:05:38", "remaining_time": "1:33:46"} +{"current_steps": 106, "total_steps": 1850, "loss": 0.7316, "lr": 4.96036362225141e-06, "epoch": 0.572972972972973, "percentage": 5.73, "elapsed_time": "0:05:40", "remaining_time": "1:33:30"} +{"current_steps": 107, "total_steps": 1850, "loss": 0.6426, "lr": 4.959607097793575e-06, "epoch": 0.5783783783783784, "percentage": 5.78, "elapsed_time": "0:05:43", "remaining_time": "1:33:14"} +{"current_steps": 108, "total_steps": 1850, "loss": 1.0044, "lr": 4.9588434804670176e-06, "epoch": 0.5837837837837838, "percentage": 5.84, "elapsed_time": "0:05:50", "remaining_time": "1:34:17"} +{"current_steps": 109, "total_steps": 1850, "loss": 0.9219, "lr": 4.958072772473812e-06, "epoch": 0.5891891891891892, "percentage": 5.89, "elapsed_time": "0:05:54", "remaining_time": "1:34:14"} +{"current_steps": 110, "total_steps": 1850, "loss": 0.6056, "lr": 4.9572949760364795e-06, "epoch": 0.5945945945945946, "percentage": 5.95, "elapsed_time": "0:05:54", "remaining_time": "1:33:33"} +{"current_steps": 111, "total_steps": 1850, "loss": 0.6346, "lr": 4.9565100933979835e-06, "epoch": 0.6, "percentage": 6.0, "elapsed_time": "0:05:56", "remaining_time": "1:33:08"} +{"current_steps": 112, "total_steps": 1850, "loss": 0.9856, "lr": 4.9557181268217225e-06, "epoch": 0.6054054054054054, "percentage": 6.05, "elapsed_time": "0:05:58", "remaining_time": "1:32:46"} +{"current_steps": 113, "total_steps": 1850, "loss": 0.8669, "lr": 4.954919078591521e-06, "epoch": 0.6108108108108108, "percentage": 6.11, "elapsed_time": "0:06:00", "remaining_time": "1:32:27"} +{"current_steps": 114, "total_steps": 1850, "loss": 0.7201, "lr": 4.954112951011628e-06, "epoch": 0.6162162162162163, "percentage": 6.16, "elapsed_time": "0:06:04", "remaining_time": "1:32:28"} +{"current_steps": 115, "total_steps": 1850, "loss": 0.9095, "lr": 4.9532997464067065e-06, "epoch": 0.6216216216216216, "percentage": 6.22, "elapsed_time": "0:06:06", "remaining_time": "1:32:08"} +{"current_steps": 116, "total_steps": 1850, "loss": 1.0213, "lr": 4.952479467121828e-06, "epoch": 0.6270270270270271, "percentage": 6.27, "elapsed_time": "0:06:08", "remaining_time": "1:31:47"} +{"current_steps": 117, "total_steps": 1850, "loss": 1.1154, "lr": 4.951652115522463e-06, "epoch": 0.6324324324324324, "percentage": 6.32, "elapsed_time": "0:06:10", "remaining_time": "1:31:24"} +{"current_steps": 118, "total_steps": 1850, "loss": 0.691, "lr": 4.950817693994481e-06, "epoch": 0.6378378378378379, "percentage": 6.38, "elapsed_time": "0:06:13", "remaining_time": "1:31:29"} +{"current_steps": 119, "total_steps": 1850, "loss": 0.7224, "lr": 4.949976204944135e-06, "epoch": 0.6432432432432432, "percentage": 6.43, "elapsed_time": "0:06:17", "remaining_time": "1:31:24"} +{"current_steps": 120, "total_steps": 1850, "loss": 0.9256, "lr": 4.949127650798063e-06, "epoch": 0.6486486486486487, "percentage": 6.49, "elapsed_time": "0:06:18", "remaining_time": "1:30:49"} +{"current_steps": 121, "total_steps": 1850, "loss": 0.6892, "lr": 4.948272034003275e-06, "epoch": 0.654054054054054, "percentage": 6.54, "elapsed_time": "0:06:18", "remaining_time": "1:30:15"} +{"current_steps": 122, "total_steps": 1850, "loss": 0.5878, "lr": 4.947409357027148e-06, "epoch": 0.6594594594594595, "percentage": 6.59, "elapsed_time": "0:06:20", "remaining_time": "1:29:48"} +{"current_steps": 123, "total_steps": 1850, "loss": 0.9904, "lr": 4.9465396223574165e-06, "epoch": 0.6648648648648648, "percentage": 6.65, "elapsed_time": "0:06:25", "remaining_time": "1:30:08"} +{"current_steps": 124, "total_steps": 1850, "loss": 1.1592, "lr": 4.945662832502172e-06, "epoch": 0.6702702702702703, "percentage": 6.7, "elapsed_time": "0:06:31", "remaining_time": "1:30:51"} +{"current_steps": 125, "total_steps": 1850, "loss": 1.0041, "lr": 4.944778989989847e-06, "epoch": 0.6756756756756757, "percentage": 6.76, "elapsed_time": "0:06:36", "remaining_time": "1:31:07"} +{"current_steps": 126, "total_steps": 1850, "loss": 0.7045, "lr": 4.943888097369216e-06, "epoch": 0.6810810810810811, "percentage": 6.81, "elapsed_time": "0:06:39", "remaining_time": "1:31:09"} +{"current_steps": 127, "total_steps": 1850, "loss": 0.6685, "lr": 4.942990157209381e-06, "epoch": 0.6864864864864865, "percentage": 6.86, "elapsed_time": "0:06:41", "remaining_time": "1:30:48"} +{"current_steps": 128, "total_steps": 1850, "loss": 0.8812, "lr": 4.9420851720997674e-06, "epoch": 0.6918918918918919, "percentage": 6.92, "elapsed_time": "0:06:43", "remaining_time": "1:30:30"} +{"current_steps": 129, "total_steps": 1850, "loss": 1.3014, "lr": 4.94117314465012e-06, "epoch": 0.6972972972972973, "percentage": 6.97, "elapsed_time": "0:06:45", "remaining_time": "1:30:10"} +{"current_steps": 130, "total_steps": 1850, "loss": 0.6978, "lr": 4.940254077490487e-06, "epoch": 0.7027027027027027, "percentage": 7.03, "elapsed_time": "0:06:48", "remaining_time": "1:30:10"} +{"current_steps": 131, "total_steps": 1850, "loss": 0.6249, "lr": 4.939327973271222e-06, "epoch": 0.7081081081081081, "percentage": 7.08, "elapsed_time": "0:06:50", "remaining_time": "1:29:47"} +{"current_steps": 132, "total_steps": 1850, "loss": 0.6423, "lr": 4.9383948346629665e-06, "epoch": 0.7135135135135136, "percentage": 7.14, "elapsed_time": "0:06:51", "remaining_time": "1:29:21"} +{"current_steps": 133, "total_steps": 1850, "loss": 0.7193, "lr": 4.937454664356652e-06, "epoch": 0.7189189189189189, "percentage": 7.19, "elapsed_time": "0:06:53", "remaining_time": "1:29:02"} +{"current_steps": 134, "total_steps": 1850, "loss": 0.7065, "lr": 4.9365074650634855e-06, "epoch": 0.7243243243243244, "percentage": 7.24, "elapsed_time": "0:06:56", "remaining_time": "1:28:53"} +{"current_steps": 135, "total_steps": 1850, "loss": 1.0046, "lr": 4.9355532395149445e-06, "epoch": 0.7297297297297297, "percentage": 7.3, "elapsed_time": "0:06:59", "remaining_time": "1:28:43"} +{"current_steps": 136, "total_steps": 1850, "loss": 0.6771, "lr": 4.9345919904627655e-06, "epoch": 0.7351351351351352, "percentage": 7.35, "elapsed_time": "0:07:03", "remaining_time": "1:28:52"} +{"current_steps": 137, "total_steps": 1850, "loss": 0.6589, "lr": 4.933623720678944e-06, "epoch": 0.7405405405405405, "percentage": 7.41, "elapsed_time": "0:07:06", "remaining_time": "1:28:56"} +{"current_steps": 138, "total_steps": 1850, "loss": 0.8755, "lr": 4.932648432955718e-06, "epoch": 0.745945945945946, "percentage": 7.46, "elapsed_time": "0:07:10", "remaining_time": "1:28:55"} +{"current_steps": 139, "total_steps": 1850, "loss": 0.6685, "lr": 4.931666130105564e-06, "epoch": 0.7513513513513513, "percentage": 7.51, "elapsed_time": "0:07:13", "remaining_time": "1:28:59"} +{"current_steps": 140, "total_steps": 1850, "loss": 0.8101, "lr": 4.930676814961189e-06, "epoch": 0.7567567567567568, "percentage": 7.57, "elapsed_time": "0:07:15", "remaining_time": "1:28:35"} +{"current_steps": 141, "total_steps": 1850, "loss": 0.8193, "lr": 4.92968049037552e-06, "epoch": 0.7621621621621621, "percentage": 7.62, "elapsed_time": "0:07:18", "remaining_time": "1:28:35"} +{"current_steps": 142, "total_steps": 1850, "loss": 0.7852, "lr": 4.9286771592217005e-06, "epoch": 0.7675675675675676, "percentage": 7.68, "elapsed_time": "0:07:20", "remaining_time": "1:28:15"} +{"current_steps": 143, "total_steps": 1850, "loss": 1.0388, "lr": 4.927666824393076e-06, "epoch": 0.772972972972973, "percentage": 7.73, "elapsed_time": "0:07:23", "remaining_time": "1:28:14"} +{"current_steps": 144, "total_steps": 1850, "loss": 0.8266, "lr": 4.926649488803191e-06, "epoch": 0.7783783783783784, "percentage": 7.78, "elapsed_time": "0:07:27", "remaining_time": "1:28:20"} +{"current_steps": 145, "total_steps": 1850, "loss": 0.4895, "lr": 4.925625155385776e-06, "epoch": 0.7837837837837838, "percentage": 7.84, "elapsed_time": "0:07:30", "remaining_time": "1:28:13"} +{"current_steps": 146, "total_steps": 1850, "loss": 0.8759, "lr": 4.924593827094743e-06, "epoch": 0.7891891891891892, "percentage": 7.89, "elapsed_time": "0:07:31", "remaining_time": "1:27:52"} +{"current_steps": 147, "total_steps": 1850, "loss": 0.701, "lr": 4.923555506904176e-06, "epoch": 0.7945945945945946, "percentage": 7.95, "elapsed_time": "0:07:34", "remaining_time": "1:27:42"} +{"current_steps": 148, "total_steps": 1850, "loss": 1.1327, "lr": 4.922510197808321e-06, "epoch": 0.8, "percentage": 8.0, "elapsed_time": "0:07:36", "remaining_time": "1:27:29"} +{"current_steps": 149, "total_steps": 1850, "loss": 0.7587, "lr": 4.921457902821578e-06, "epoch": 0.8054054054054054, "percentage": 8.05, "elapsed_time": "0:07:41", "remaining_time": "1:27:44"} +{"current_steps": 150, "total_steps": 1850, "loss": 1.2158, "lr": 4.920398624978493e-06, "epoch": 0.8108108108108109, "percentage": 8.11, "elapsed_time": "0:07:43", "remaining_time": "1:27:33"} +{"current_steps": 151, "total_steps": 1850, "loss": 0.6852, "lr": 4.919332367333748e-06, "epoch": 0.8162162162162162, "percentage": 8.16, "elapsed_time": "0:07:46", "remaining_time": "1:27:32"} +{"current_steps": 152, "total_steps": 1850, "loss": 0.6611, "lr": 4.918259132962154e-06, "epoch": 0.8216216216216217, "percentage": 8.22, "elapsed_time": "0:07:49", "remaining_time": "1:27:19"} +{"current_steps": 153, "total_steps": 1850, "loss": 0.7327, "lr": 4.917178924958638e-06, "epoch": 0.827027027027027, "percentage": 8.27, "elapsed_time": "0:07:50", "remaining_time": "1:26:58"} +{"current_steps": 154, "total_steps": 1850, "loss": 0.8528, "lr": 4.916091746438243e-06, "epoch": 0.8324324324324325, "percentage": 8.32, "elapsed_time": "0:07:51", "remaining_time": "1:26:36"} +{"current_steps": 155, "total_steps": 1850, "loss": 0.9141, "lr": 4.9149976005361085e-06, "epoch": 0.8378378378378378, "percentage": 8.38, "elapsed_time": "0:07:55", "remaining_time": "1:26:35"} +{"current_steps": 156, "total_steps": 1850, "loss": 1.1132, "lr": 4.913896490407467e-06, "epoch": 0.8432432432432433, "percentage": 8.43, "elapsed_time": "0:07:57", "remaining_time": "1:26:27"} +{"current_steps": 157, "total_steps": 1850, "loss": 0.7587, "lr": 4.912788419227635e-06, "epoch": 0.8486486486486486, "percentage": 8.49, "elapsed_time": "0:08:01", "remaining_time": "1:26:29"} +{"current_steps": 158, "total_steps": 1850, "loss": 0.9227, "lr": 4.911673390192002e-06, "epoch": 0.8540540540540541, "percentage": 8.54, "elapsed_time": "0:08:03", "remaining_time": "1:26:16"} +{"current_steps": 159, "total_steps": 1850, "loss": 0.8154, "lr": 4.910551406516023e-06, "epoch": 0.8594594594594595, "percentage": 8.59, "elapsed_time": "0:08:07", "remaining_time": "1:26:23"} +{"current_steps": 160, "total_steps": 1850, "loss": 0.9897, "lr": 4.909422471435207e-06, "epoch": 0.8648648648648649, "percentage": 8.65, "elapsed_time": "0:08:09", "remaining_time": "1:26:14"} +{"current_steps": 161, "total_steps": 1850, "loss": 0.6162, "lr": 4.90828658820511e-06, "epoch": 0.8702702702702703, "percentage": 8.7, "elapsed_time": "0:08:11", "remaining_time": "1:25:52"} +{"current_steps": 162, "total_steps": 1850, "loss": 0.5734, "lr": 4.907143760101325e-06, "epoch": 0.8756756756756757, "percentage": 8.76, "elapsed_time": "0:08:12", "remaining_time": "1:25:31"} +{"current_steps": 163, "total_steps": 1850, "loss": 0.8328, "lr": 4.905993990419472e-06, "epoch": 0.8810810810810811, "percentage": 8.81, "elapsed_time": "0:08:19", "remaining_time": "1:26:09"} +{"current_steps": 164, "total_steps": 1850, "loss": 0.6787, "lr": 4.904837282475187e-06, "epoch": 0.8864864864864865, "percentage": 8.86, "elapsed_time": "0:08:21", "remaining_time": "1:25:51"} +{"current_steps": 165, "total_steps": 1850, "loss": 0.9658, "lr": 4.9036736396041165e-06, "epoch": 0.8918918918918919, "percentage": 8.92, "elapsed_time": "0:08:24", "remaining_time": "1:25:49"} +{"current_steps": 166, "total_steps": 1850, "loss": 0.7899, "lr": 4.902503065161905e-06, "epoch": 0.8972972972972973, "percentage": 8.97, "elapsed_time": "0:08:30", "remaining_time": "1:26:16"} +{"current_steps": 167, "total_steps": 1850, "loss": 0.9476, "lr": 4.901325562524185e-06, "epoch": 0.9027027027027027, "percentage": 9.03, "elapsed_time": "0:08:34", "remaining_time": "1:26:20"} +{"current_steps": 168, "total_steps": 1850, "loss": 0.7589, "lr": 4.900141135086569e-06, "epoch": 0.9081081081081082, "percentage": 9.08, "elapsed_time": "0:08:41", "remaining_time": "1:26:57"} +{"current_steps": 169, "total_steps": 1850, "loss": 0.6724, "lr": 4.898949786264638e-06, "epoch": 0.9135135135135135, "percentage": 9.14, "elapsed_time": "0:08:43", "remaining_time": "1:26:49"} +{"current_steps": 170, "total_steps": 1850, "loss": 0.6968, "lr": 4.897751519493933e-06, "epoch": 0.918918918918919, "percentage": 9.19, "elapsed_time": "0:08:47", "remaining_time": "1:26:57"} +{"current_steps": 171, "total_steps": 1850, "loss": 0.7984, "lr": 4.896546338229945e-06, "epoch": 0.9243243243243243, "percentage": 9.24, "elapsed_time": "0:08:50", "remaining_time": "1:26:46"} +{"current_steps": 172, "total_steps": 1850, "loss": 0.6109, "lr": 4.8953342459481034e-06, "epoch": 0.9297297297297298, "percentage": 9.3, "elapsed_time": "0:08:53", "remaining_time": "1:26:47"} +{"current_steps": 173, "total_steps": 1850, "loss": 0.8126, "lr": 4.894115246143768e-06, "epoch": 0.9351351351351351, "percentage": 9.35, "elapsed_time": "0:08:57", "remaining_time": "1:26:51"} +{"current_steps": 174, "total_steps": 1850, "loss": 0.6862, "lr": 4.892889342332218e-06, "epoch": 0.9405405405405406, "percentage": 9.41, "elapsed_time": "0:08:59", "remaining_time": "1:26:40"} +{"current_steps": 175, "total_steps": 1850, "loss": 0.9895, "lr": 4.891656538048642e-06, "epoch": 0.9459459459459459, "percentage": 9.46, "elapsed_time": "0:09:06", "remaining_time": "1:27:09"} +{"current_steps": 176, "total_steps": 1850, "loss": 0.8481, "lr": 4.890416836848128e-06, "epoch": 0.9513513513513514, "percentage": 9.51, "elapsed_time": "0:09:08", "remaining_time": "1:27:01"} +{"current_steps": 177, "total_steps": 1850, "loss": 0.6478, "lr": 4.889170242305652e-06, "epoch": 0.9567567567567568, "percentage": 9.57, "elapsed_time": "0:09:10", "remaining_time": "1:26:42"} +{"current_steps": 178, "total_steps": 1850, "loss": 0.9714, "lr": 4.887916758016069e-06, "epoch": 0.9621621621621622, "percentage": 9.62, "elapsed_time": "0:09:13", "remaining_time": "1:26:39"} +{"current_steps": 179, "total_steps": 1850, "loss": 1.1264, "lr": 4.886656387594104e-06, "epoch": 0.9675675675675676, "percentage": 9.68, "elapsed_time": "0:09:17", "remaining_time": "1:26:42"} +{"current_steps": 180, "total_steps": 1850, "loss": 0.7664, "lr": 4.885389134674338e-06, "epoch": 0.972972972972973, "percentage": 9.73, "elapsed_time": "0:09:21", "remaining_time": "1:26:50"} +{"current_steps": 181, "total_steps": 1850, "loss": 0.6131, "lr": 4.884115002911197e-06, "epoch": 0.9783783783783784, "percentage": 9.78, "elapsed_time": "0:09:23", "remaining_time": "1:26:32"} +{"current_steps": 182, "total_steps": 1850, "loss": 0.8733, "lr": 4.88283399597895e-06, "epoch": 0.9837837837837838, "percentage": 9.84, "elapsed_time": "0:09:27", "remaining_time": "1:26:45"} +{"current_steps": 183, "total_steps": 1850, "loss": 0.643, "lr": 4.881546117571686e-06, "epoch": 0.9891891891891892, "percentage": 9.89, "elapsed_time": "0:09:30", "remaining_time": "1:26:36"} +{"current_steps": 184, "total_steps": 1850, "loss": 0.7287, "lr": 4.8802513714033135e-06, "epoch": 0.9945945945945946, "percentage": 9.95, "elapsed_time": "0:09:35", "remaining_time": "1:26:53"} +{"current_steps": 185, "total_steps": 1850, "loss": 0.9927, "lr": 4.878949761207545e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:09:37", "remaining_time": "1:26:37"} +{"current_steps": 186, "total_steps": 1850, "loss": 0.66, "lr": 4.8776412907378845e-06, "epoch": 1.0054054054054054, "percentage": 10.05, "elapsed_time": "1:13:09", "remaining_time": "10:54:31"} +{"current_steps": 187, "total_steps": 1850, "loss": 0.594, "lr": 4.876325963767623e-06, "epoch": 1.0108108108108107, "percentage": 10.11, "elapsed_time": "1:13:11", "remaining_time": "10:50:52"} +{"current_steps": 188, "total_steps": 1850, "loss": 0.5825, "lr": 4.875003784089822e-06, "epoch": 1.0162162162162163, "percentage": 10.16, "elapsed_time": "1:13:15", "remaining_time": "10:47:35"} +{"current_steps": 189, "total_steps": 1850, "loss": 0.6594, "lr": 4.873674755517305e-06, "epoch": 1.0216216216216216, "percentage": 10.22, "elapsed_time": "1:13:17", "remaining_time": "10:44:06"} +{"current_steps": 190, "total_steps": 1850, "loss": 0.7536, "lr": 4.872338881882645e-06, "epoch": 1.027027027027027, "percentage": 10.27, "elapsed_time": "1:13:20", "remaining_time": "10:40:49"} +{"current_steps": 191, "total_steps": 1850, "loss": 0.4849, "lr": 4.870996167038154e-06, "epoch": 1.0324324324324325, "percentage": 10.32, "elapsed_time": "1:13:24", "remaining_time": "10:37:37"} +{"current_steps": 192, "total_steps": 1850, "loss": 0.3771, "lr": 4.869646614855877e-06, "epoch": 1.037837837837838, "percentage": 10.38, "elapsed_time": "1:13:26", "remaining_time": "10:34:12"} +{"current_steps": 193, "total_steps": 1850, "loss": 0.8545, "lr": 4.868290229227567e-06, "epoch": 1.0432432432432432, "percentage": 10.43, "elapsed_time": "1:13:33", "remaining_time": "10:31:28"} +{"current_steps": 194, "total_steps": 1850, "loss": 0.3698, "lr": 4.866927014064692e-06, "epoch": 1.0486486486486486, "percentage": 10.49, "elapsed_time": "1:13:34", "remaining_time": "10:28:05"} +{"current_steps": 195, "total_steps": 1850, "loss": 0.8468, "lr": 4.86555697329841e-06, "epoch": 1.054054054054054, "percentage": 10.54, "elapsed_time": "1:13:37", "remaining_time": "10:24:55"} +{"current_steps": 196, "total_steps": 1850, "loss": 0.8232, "lr": 4.864180110879562e-06, "epoch": 1.0594594594594595, "percentage": 10.59, "elapsed_time": "1:13:44", "remaining_time": "10:22:19"} +{"current_steps": 197, "total_steps": 1850, "loss": 0.4097, "lr": 4.862796430778663e-06, "epoch": 1.0648648648648649, "percentage": 10.65, "elapsed_time": "1:13:46", "remaining_time": "10:18:59"} +{"current_steps": 198, "total_steps": 1850, "loss": 0.6746, "lr": 4.861405936985889e-06, "epoch": 1.0702702702702702, "percentage": 10.7, "elapsed_time": "1:13:50", "remaining_time": "10:16:08"} +{"current_steps": 199, "total_steps": 1850, "loss": 0.6605, "lr": 4.860008633511059e-06, "epoch": 1.0756756756756758, "percentage": 10.76, "elapsed_time": "1:13:55", "remaining_time": "10:13:19"} +{"current_steps": 200, "total_steps": 1850, "loss": 0.471, "lr": 4.8586045243836384e-06, "epoch": 1.0810810810810811, "percentage": 10.81, "elapsed_time": "1:13:59", "remaining_time": "10:10:26"} +{"current_steps": 201, "total_steps": 1850, "loss": 0.7665, "lr": 4.857193613652711e-06, "epoch": 1.0864864864864865, "percentage": 10.86, "elapsed_time": "1:14:06", "remaining_time": "10:07:56"} +{"current_steps": 202, "total_steps": 1850, "loss": 0.6436, "lr": 4.8557759053869775e-06, "epoch": 1.0918918918918918, "percentage": 10.92, "elapsed_time": "1:14:07", "remaining_time": "10:04:41"} +{"current_steps": 203, "total_steps": 1850, "loss": 0.4642, "lr": 4.854351403674741e-06, "epoch": 1.0972972972972972, "percentage": 10.97, "elapsed_time": "1:14:09", "remaining_time": "10:01:36"} +{"current_steps": 204, "total_steps": 1850, "loss": 0.5737, "lr": 4.852920112623895e-06, "epoch": 1.1027027027027028, "percentage": 11.03, "elapsed_time": "1:14:10", "remaining_time": "9:58:28"} +{"current_steps": 205, "total_steps": 1850, "loss": 0.7302, "lr": 4.851482036361912e-06, "epoch": 1.1081081081081081, "percentage": 11.08, "elapsed_time": "1:14:11", "remaining_time": "9:55:19"} +{"current_steps": 206, "total_steps": 1850, "loss": 0.5229, "lr": 4.850037179035829e-06, "epoch": 1.1135135135135135, "percentage": 11.14, "elapsed_time": "1:14:13", "remaining_time": "9:52:21"} +{"current_steps": 207, "total_steps": 1850, "loss": 0.5529, "lr": 4.8485855448122425e-06, "epoch": 1.118918918918919, "percentage": 11.19, "elapsed_time": "1:14:15", "remaining_time": "9:49:24"} +{"current_steps": 208, "total_steps": 1850, "loss": 0.3635, "lr": 4.847127137877286e-06, "epoch": 1.1243243243243244, "percentage": 11.24, "elapsed_time": "1:14:17", "remaining_time": "9:46:27"} +{"current_steps": 209, "total_steps": 1850, "loss": 0.8149, "lr": 4.8456619624366285e-06, "epoch": 1.1297297297297297, "percentage": 11.3, "elapsed_time": "1:14:20", "remaining_time": "9:43:39"} +{"current_steps": 210, "total_steps": 1850, "loss": 0.8333, "lr": 4.844190022715456e-06, "epoch": 1.135135135135135, "percentage": 11.35, "elapsed_time": "1:14:23", "remaining_time": "9:40:55"} +{"current_steps": 211, "total_steps": 1850, "loss": 0.3717, "lr": 4.84271132295846e-06, "epoch": 1.1405405405405404, "percentage": 11.41, "elapsed_time": "1:14:26", "remaining_time": "9:38:16"} +{"current_steps": 212, "total_steps": 1850, "loss": 0.5994, "lr": 4.841225867429826e-06, "epoch": 1.145945945945946, "percentage": 11.46, "elapsed_time": "1:14:28", "remaining_time": "9:35:28"} +{"current_steps": 213, "total_steps": 1850, "loss": 0.8382, "lr": 4.839733660413224e-06, "epoch": 1.1513513513513514, "percentage": 11.51, "elapsed_time": "1:14:31", "remaining_time": "9:32:48"} +{"current_steps": 214, "total_steps": 1850, "loss": 0.818, "lr": 4.838234706211792e-06, "epoch": 1.1567567567567567, "percentage": 11.57, "elapsed_time": "1:14:35", "remaining_time": "9:30:11"} +{"current_steps": 215, "total_steps": 1850, "loss": 0.4267, "lr": 4.836729009148124e-06, "epoch": 1.1621621621621623, "percentage": 11.62, "elapsed_time": "1:14:38", "remaining_time": "9:27:37"} +{"current_steps": 216, "total_steps": 1850, "loss": 0.3472, "lr": 4.835216573564261e-06, "epoch": 1.1675675675675676, "percentage": 11.68, "elapsed_time": "1:14:41", "remaining_time": "9:25:04"} +{"current_steps": 217, "total_steps": 1850, "loss": 0.6323, "lr": 4.833697403821672e-06, "epoch": 1.172972972972973, "percentage": 11.73, "elapsed_time": "1:14:46", "remaining_time": "9:22:40"} +{"current_steps": 218, "total_steps": 1850, "loss": 0.6831, "lr": 4.8321715043012516e-06, "epoch": 1.1783783783783783, "percentage": 11.78, "elapsed_time": "1:14:49", "remaining_time": "9:20:08"} +{"current_steps": 219, "total_steps": 1850, "loss": 0.3682, "lr": 4.830638879403296e-06, "epoch": 1.1837837837837837, "percentage": 11.84, "elapsed_time": "1:14:51", "remaining_time": "9:17:26"} +{"current_steps": 220, "total_steps": 1850, "loss": 0.4154, "lr": 4.8290995335475e-06, "epoch": 1.1891891891891893, "percentage": 11.89, "elapsed_time": "1:14:53", "remaining_time": "9:14:51"} +{"current_steps": 221, "total_steps": 1850, "loss": 0.3991, "lr": 4.827553471172935e-06, "epoch": 1.1945945945945946, "percentage": 11.95, "elapsed_time": "1:14:54", "remaining_time": "9:12:11"} +{"current_steps": 222, "total_steps": 1850, "loss": 0.4538, "lr": 4.826000696738045e-06, "epoch": 1.2, "percentage": 12.0, "elapsed_time": "1:14:57", "remaining_time": "9:09:38"} +{"current_steps": 223, "total_steps": 1850, "loss": 0.7692, "lr": 4.824441214720629e-06, "epoch": 1.2054054054054055, "percentage": 12.05, "elapsed_time": "1:14:59", "remaining_time": "9:07:07"} +{"current_steps": 224, "total_steps": 1850, "loss": 0.6038, "lr": 4.8228750296178275e-06, "epoch": 1.2108108108108109, "percentage": 12.11, "elapsed_time": "1:15:02", "remaining_time": "9:04:44"} +{"current_steps": 225, "total_steps": 1850, "loss": 0.4147, "lr": 4.821302145946113e-06, "epoch": 1.2162162162162162, "percentage": 12.16, "elapsed_time": "1:15:05", "remaining_time": "9:02:16"} +{"current_steps": 226, "total_steps": 1850, "loss": 0.5398, "lr": 4.819722568241274e-06, "epoch": 1.2216216216216216, "percentage": 12.22, "elapsed_time": "1:15:07", "remaining_time": "8:59:53"} +{"current_steps": 227, "total_steps": 1850, "loss": 0.3864, "lr": 4.818136301058401e-06, "epoch": 1.227027027027027, "percentage": 12.27, "elapsed_time": "1:15:11", "remaining_time": "8:57:37"} +{"current_steps": 228, "total_steps": 1850, "loss": 0.5712, "lr": 4.816543348971879e-06, "epoch": 1.2324324324324325, "percentage": 12.32, "elapsed_time": "1:15:15", "remaining_time": "8:55:21"} +{"current_steps": 229, "total_steps": 1850, "loss": 0.662, "lr": 4.814943716575368e-06, "epoch": 1.2378378378378379, "percentage": 12.38, "elapsed_time": "1:15:16", "remaining_time": "8:52:49"} +{"current_steps": 230, "total_steps": 1850, "loss": 0.8661, "lr": 4.813337408481793e-06, "epoch": 1.2432432432432432, "percentage": 12.43, "elapsed_time": "1:15:19", "remaining_time": "8:50:32"} +{"current_steps": 231, "total_steps": 1850, "loss": 0.9218, "lr": 4.811724429323329e-06, "epoch": 1.2486486486486488, "percentage": 12.49, "elapsed_time": "1:15:21", "remaining_time": "8:48:08"} +{"current_steps": 232, "total_steps": 1850, "loss": 0.5597, "lr": 4.810104783751389e-06, "epoch": 1.2540540540540541, "percentage": 12.54, "elapsed_time": "1:15:22", "remaining_time": "8:45:43"} +{"current_steps": 233, "total_steps": 1850, "loss": 0.4786, "lr": 4.8084784764366125e-06, "epoch": 1.2594594594594595, "percentage": 12.59, "elapsed_time": "1:15:24", "remaining_time": "8:43:17"} +{"current_steps": 234, "total_steps": 1850, "loss": 0.5219, "lr": 4.806845512068846e-06, "epoch": 1.2648648648648648, "percentage": 12.65, "elapsed_time": "1:15:28", "remaining_time": "8:41:15"} +{"current_steps": 235, "total_steps": 1850, "loss": 0.643, "lr": 4.805205895357137e-06, "epoch": 1.2702702702702702, "percentage": 12.7, "elapsed_time": "1:15:29", "remaining_time": "8:38:50"} +{"current_steps": 236, "total_steps": 1850, "loss": 0.5858, "lr": 4.803559631029713e-06, "epoch": 1.2756756756756757, "percentage": 12.76, "elapsed_time": "1:15:32", "remaining_time": "8:36:35"} +{"current_steps": 237, "total_steps": 1850, "loss": 0.4185, "lr": 4.801906723833973e-06, "epoch": 1.281081081081081, "percentage": 12.81, "elapsed_time": "1:15:35", "remaining_time": "8:34:25"} +{"current_steps": 238, "total_steps": 1850, "loss": 0.4917, "lr": 4.8002471785364734e-06, "epoch": 1.2864864864864864, "percentage": 12.86, "elapsed_time": "1:15:36", "remaining_time": "8:32:08"} +{"current_steps": 239, "total_steps": 1850, "loss": 0.645, "lr": 4.798580999922913e-06, "epoch": 1.291891891891892, "percentage": 12.92, "elapsed_time": "1:15:38", "remaining_time": "8:29:52"} +{"current_steps": 240, "total_steps": 1850, "loss": 0.5378, "lr": 4.796908192798117e-06, "epoch": 1.2972972972972974, "percentage": 12.97, "elapsed_time": "1:15:39", "remaining_time": "8:27:31"} +{"current_steps": 241, "total_steps": 1850, "loss": 0.5197, "lr": 4.7952287619860276e-06, "epoch": 1.3027027027027027, "percentage": 13.03, "elapsed_time": "1:15:42", "remaining_time": "8:25:24"} +{"current_steps": 242, "total_steps": 1850, "loss": 1.0226, "lr": 4.793542712329689e-06, "epoch": 1.308108108108108, "percentage": 13.08, "elapsed_time": "1:15:45", "remaining_time": "8:23:21"} +{"current_steps": 243, "total_steps": 1850, "loss": 0.5502, "lr": 4.791850048691228e-06, "epoch": 1.3135135135135134, "percentage": 13.14, "elapsed_time": "1:15:49", "remaining_time": "8:21:27"} +{"current_steps": 244, "total_steps": 1850, "loss": 0.6976, "lr": 4.79015077595185e-06, "epoch": 1.318918918918919, "percentage": 13.19, "elapsed_time": "1:15:56", "remaining_time": "8:19:47"} +{"current_steps": 245, "total_steps": 1850, "loss": 0.4795, "lr": 4.788444899011816e-06, "epoch": 1.3243243243243243, "percentage": 13.24, "elapsed_time": "1:15:58", "remaining_time": "8:17:44"} +{"current_steps": 246, "total_steps": 1850, "loss": 0.6526, "lr": 4.786732422790432e-06, "epoch": 1.3297297297297297, "percentage": 13.3, "elapsed_time": "1:16:02", "remaining_time": "8:15:46"} +{"current_steps": 247, "total_steps": 1850, "loss": 0.5551, "lr": 4.785013352226036e-06, "epoch": 1.3351351351351353, "percentage": 13.35, "elapsed_time": "1:16:03", "remaining_time": "8:13:34"} +{"current_steps": 248, "total_steps": 1850, "loss": 0.3151, "lr": 4.7832876922759805e-06, "epoch": 1.3405405405405406, "percentage": 13.41, "elapsed_time": "1:16:05", "remaining_time": "8:11:31"} +{"current_steps": 249, "total_steps": 1850, "loss": 0.6713, "lr": 4.781555447916622e-06, "epoch": 1.345945945945946, "percentage": 13.46, "elapsed_time": "1:16:08", "remaining_time": "8:09:31"} +{"current_steps": 250, "total_steps": 1850, "loss": 0.437, "lr": 4.779816624143302e-06, "epoch": 1.3513513513513513, "percentage": 13.51, "elapsed_time": "1:16:09", "remaining_time": "8:07:25"} +{"current_steps": 251, "total_steps": 1850, "loss": 0.7632, "lr": 4.77807122597034e-06, "epoch": 1.3567567567567567, "percentage": 13.57, "elapsed_time": "1:16:13", "remaining_time": "8:05:34"} +{"current_steps": 252, "total_steps": 1850, "loss": 0.4894, "lr": 4.776319258431009e-06, "epoch": 1.3621621621621622, "percentage": 13.62, "elapsed_time": "1:16:17", "remaining_time": "8:03:45"} +{"current_steps": 253, "total_steps": 1850, "loss": 0.4456, "lr": 4.77456072657753e-06, "epoch": 1.3675675675675676, "percentage": 13.68, "elapsed_time": "1:16:19", "remaining_time": "8:01:46"} +{"current_steps": 254, "total_steps": 1850, "loss": 0.5381, "lr": 4.772795635481053e-06, "epoch": 1.372972972972973, "percentage": 13.73, "elapsed_time": "1:16:21", "remaining_time": "7:59:45"} +{"current_steps": 255, "total_steps": 1850, "loss": 1.0302, "lr": 4.77102399023164e-06, "epoch": 1.3783783783783785, "percentage": 13.78, "elapsed_time": "1:16:24", "remaining_time": "7:57:55"} +{"current_steps": 256, "total_steps": 1850, "loss": 0.4875, "lr": 4.769245795938261e-06, "epoch": 1.3837837837837839, "percentage": 13.84, "elapsed_time": "1:16:26", "remaining_time": "7:55:57"} +{"current_steps": 257, "total_steps": 1850, "loss": 0.4923, "lr": 4.767461057728763e-06, "epoch": 1.3891891891891892, "percentage": 13.89, "elapsed_time": "1:16:29", "remaining_time": "7:54:07"} +{"current_steps": 258, "total_steps": 1850, "loss": 0.6699, "lr": 4.76566978074987e-06, "epoch": 1.3945945945945946, "percentage": 13.95, "elapsed_time": "1:16:33", "remaining_time": "7:52:25"} +{"current_steps": 259, "total_steps": 1850, "loss": 0.6117, "lr": 4.7638719701671586e-06, "epoch": 1.4, "percentage": 14.0, "elapsed_time": "1:16:37", "remaining_time": "7:50:40"} +{"current_steps": 260, "total_steps": 1850, "loss": 0.8534, "lr": 4.762067631165049e-06, "epoch": 1.4054054054054055, "percentage": 14.05, "elapsed_time": "1:16:40", "remaining_time": "7:48:55"} +{"current_steps": 261, "total_steps": 1850, "loss": 0.5057, "lr": 4.760256768946787e-06, "epoch": 1.4108108108108108, "percentage": 14.11, "elapsed_time": "1:16:47", "remaining_time": "7:47:34"} +{"current_steps": 262, "total_steps": 1850, "loss": 0.7286, "lr": 4.758439388734429e-06, "epoch": 1.4162162162162162, "percentage": 14.16, "elapsed_time": "1:16:50", "remaining_time": "7:45:45"} +{"current_steps": 263, "total_steps": 1850, "loss": 0.9827, "lr": 4.7566154957688276e-06, "epoch": 1.4216216216216218, "percentage": 14.22, "elapsed_time": "1:16:52", "remaining_time": "7:43:50"} +{"current_steps": 264, "total_steps": 1850, "loss": 0.7042, "lr": 4.754785095309617e-06, "epoch": 1.427027027027027, "percentage": 14.27, "elapsed_time": "1:16:53", "remaining_time": "7:41:56"} +{"current_steps": 265, "total_steps": 1850, "loss": 0.5179, "lr": 4.752948192635199e-06, "epoch": 1.4324324324324325, "percentage": 14.32, "elapsed_time": "1:16:55", "remaining_time": "7:40:04"} +{"current_steps": 266, "total_steps": 1850, "loss": 0.8527, "lr": 4.751104793042722e-06, "epoch": 1.4378378378378378, "percentage": 14.38, "elapsed_time": "1:16:59", "remaining_time": "7:38:28"} +{"current_steps": 267, "total_steps": 1850, "loss": 0.5627, "lr": 4.7492549018480725e-06, "epoch": 1.4432432432432432, "percentage": 14.43, "elapsed_time": "1:17:02", "remaining_time": "7:36:46"} +{"current_steps": 268, "total_steps": 1850, "loss": 0.8981, "lr": 4.747398524385858e-06, "epoch": 1.4486486486486487, "percentage": 14.49, "elapsed_time": "1:17:05", "remaining_time": "7:35:01"} +{"current_steps": 269, "total_steps": 1850, "loss": 0.5455, "lr": 4.745535666009389e-06, "epoch": 1.454054054054054, "percentage": 14.54, "elapsed_time": "1:17:07", "remaining_time": "7:33:17"} +{"current_steps": 270, "total_steps": 1850, "loss": 0.4348, "lr": 4.743666332090664e-06, "epoch": 1.4594594594594594, "percentage": 14.59, "elapsed_time": "1:17:10", "remaining_time": "7:31:37"} +{"current_steps": 271, "total_steps": 1850, "loss": 0.5524, "lr": 4.74179052802036e-06, "epoch": 1.464864864864865, "percentage": 14.65, "elapsed_time": "1:17:13", "remaining_time": "7:29:54"} +{"current_steps": 272, "total_steps": 1850, "loss": 0.7469, "lr": 4.739908259207807e-06, "epoch": 1.4702702702702704, "percentage": 14.7, "elapsed_time": "1:17:14", "remaining_time": "7:28:09"} +{"current_steps": 273, "total_steps": 1850, "loss": 0.7216, "lr": 4.738019531080981e-06, "epoch": 1.4756756756756757, "percentage": 14.76, "elapsed_time": "1:17:18", "remaining_time": "7:26:32"} +{"current_steps": 274, "total_steps": 1850, "loss": 0.7527, "lr": 4.7361243490864825e-06, "epoch": 1.481081081081081, "percentage": 14.81, "elapsed_time": "1:17:23", "remaining_time": "7:25:10"} +{"current_steps": 275, "total_steps": 1850, "loss": 0.7437, "lr": 4.734222718689527e-06, "epoch": 1.4864864864864864, "percentage": 14.86, "elapsed_time": "1:17:29", "remaining_time": "7:23:47"} +{"current_steps": 276, "total_steps": 1850, "loss": 0.5187, "lr": 4.732314645373922e-06, "epoch": 1.491891891891892, "percentage": 14.92, "elapsed_time": "1:17:32", "remaining_time": "7:22:11"} +{"current_steps": 277, "total_steps": 1850, "loss": 0.7186, "lr": 4.730400134642055e-06, "epoch": 1.4972972972972973, "percentage": 14.97, "elapsed_time": "1:17:35", "remaining_time": "7:20:38"} +{"current_steps": 278, "total_steps": 1850, "loss": 0.9655, "lr": 4.728479192014879e-06, "epoch": 1.5027027027027027, "percentage": 15.03, "elapsed_time": "1:17:42", "remaining_time": "7:19:23"} +{"current_steps": 279, "total_steps": 1850, "loss": 0.6251, "lr": 4.726551823031895e-06, "epoch": 1.5081081081081082, "percentage": 15.08, "elapsed_time": "1:17:46", "remaining_time": "7:17:57"} +{"current_steps": 280, "total_steps": 1850, "loss": 0.4805, "lr": 4.7246180332511335e-06, "epoch": 1.5135135135135136, "percentage": 15.14, "elapsed_time": "1:17:49", "remaining_time": "7:16:19"} +{"current_steps": 281, "total_steps": 1850, "loss": 1.0939, "lr": 4.722677828249142e-06, "epoch": 1.518918918918919, "percentage": 15.19, "elapsed_time": "1:17:52", "remaining_time": "7:14:49"} +{"current_steps": 282, "total_steps": 1850, "loss": 0.9485, "lr": 4.720731213620972e-06, "epoch": 1.5243243243243243, "percentage": 15.24, "elapsed_time": "1:17:55", "remaining_time": "7:13:18"} +{"current_steps": 283, "total_steps": 1850, "loss": 0.5805, "lr": 4.718778194980152e-06, "epoch": 1.5297297297297296, "percentage": 15.3, "elapsed_time": "1:17:59", "remaining_time": "7:11:48"} +{"current_steps": 284, "total_steps": 1850, "loss": 0.77, "lr": 4.7168187779586805e-06, "epoch": 1.535135135135135, "percentage": 15.35, "elapsed_time": "1:18:02", "remaining_time": "7:10:18"} +{"current_steps": 285, "total_steps": 1850, "loss": 0.5932, "lr": 4.71485296820701e-06, "epoch": 1.5405405405405406, "percentage": 15.41, "elapsed_time": "1:18:04", "remaining_time": "7:08:41"} +{"current_steps": 286, "total_steps": 1850, "loss": 0.6296, "lr": 4.7128807713940245e-06, "epoch": 1.545945945945946, "percentage": 15.46, "elapsed_time": "1:18:09", "remaining_time": "7:07:26"} +{"current_steps": 287, "total_steps": 1850, "loss": 0.6201, "lr": 4.710902193207028e-06, "epoch": 1.5513513513513515, "percentage": 15.51, "elapsed_time": "1:18:15", "remaining_time": "7:06:12"} +{"current_steps": 288, "total_steps": 1850, "loss": 0.5682, "lr": 4.708917239351727e-06, "epoch": 1.5567567567567568, "percentage": 15.57, "elapsed_time": "1:18:22", "remaining_time": "7:05:06"} +{"current_steps": 289, "total_steps": 1850, "loss": 0.8877, "lr": 4.706925915552214e-06, "epoch": 1.5621621621621622, "percentage": 15.62, "elapsed_time": "1:18:24", "remaining_time": "7:03:29"} +{"current_steps": 290, "total_steps": 1850, "loss": 0.6521, "lr": 4.704928227550949e-06, "epoch": 1.5675675675675675, "percentage": 15.68, "elapsed_time": "1:18:28", "remaining_time": "7:02:07"} +{"current_steps": 291, "total_steps": 1850, "loss": 0.4929, "lr": 4.702924181108745e-06, "epoch": 1.572972972972973, "percentage": 15.73, "elapsed_time": "1:18:30", "remaining_time": "7:00:36"} +{"current_steps": 292, "total_steps": 1850, "loss": 0.4515, "lr": 4.700913782004755e-06, "epoch": 1.5783783783783782, "percentage": 15.78, "elapsed_time": "1:18:32", "remaining_time": "6:59:06"} +{"current_steps": 293, "total_steps": 1850, "loss": 0.5477, "lr": 4.698897036036446e-06, "epoch": 1.5837837837837838, "percentage": 15.84, "elapsed_time": "1:18:37", "remaining_time": "6:57:48"} +{"current_steps": 294, "total_steps": 1850, "loss": 0.9589, "lr": 4.696873949019591e-06, "epoch": 1.5891891891891892, "percentage": 15.89, "elapsed_time": "1:18:39", "remaining_time": "6:56:18"} +{"current_steps": 295, "total_steps": 1850, "loss": 0.4425, "lr": 4.694844526788248e-06, "epoch": 1.5945945945945947, "percentage": 15.95, "elapsed_time": "1:18:42", "remaining_time": "6:54:51"} +{"current_steps": 296, "total_steps": 1850, "loss": 0.4899, "lr": 4.692808775194745e-06, "epoch": 1.6, "percentage": 16.0, "elapsed_time": "1:18:48", "remaining_time": "6:53:43"} +{"current_steps": 297, "total_steps": 1850, "loss": 0.4884, "lr": 4.690766700109659e-06, "epoch": 1.6054054054054054, "percentage": 16.05, "elapsed_time": "1:18:53", "remaining_time": "6:52:29"} +{"current_steps": 298, "total_steps": 1850, "loss": 0.8977, "lr": 4.688718307421807e-06, "epoch": 1.6108108108108108, "percentage": 16.11, "elapsed_time": "1:18:55", "remaining_time": "6:51:02"} +{"current_steps": 299, "total_steps": 1850, "loss": 0.6833, "lr": 4.686663603038222e-06, "epoch": 1.6162162162162161, "percentage": 16.16, "elapsed_time": "1:18:56", "remaining_time": "6:49:29"} +{"current_steps": 300, "total_steps": 1850, "loss": 0.9141, "lr": 4.6846025928841365e-06, "epoch": 1.6216216216216215, "percentage": 16.22, "elapsed_time": "1:19:01", "remaining_time": "6:48:19"} +{"current_steps": 301, "total_steps": 1850, "loss": 0.5121, "lr": 4.6825352829029705e-06, "epoch": 1.627027027027027, "percentage": 16.27, "elapsed_time": "1:19:05", "remaining_time": "6:47:03"} +{"current_steps": 302, "total_steps": 1850, "loss": 0.5399, "lr": 4.68046167905631e-06, "epoch": 1.6324324324324324, "percentage": 16.32, "elapsed_time": "1:19:11", "remaining_time": "6:45:54"} +{"current_steps": 303, "total_steps": 1850, "loss": 0.7921, "lr": 4.678381787323889e-06, "epoch": 1.637837837837838, "percentage": 16.38, "elapsed_time": "1:19:15", "remaining_time": "6:44:42"} +{"current_steps": 304, "total_steps": 1850, "loss": 0.7178, "lr": 4.676295613703577e-06, "epoch": 1.6432432432432433, "percentage": 16.43, "elapsed_time": "1:19:19", "remaining_time": "6:43:26"} +{"current_steps": 305, "total_steps": 1850, "loss": 0.7162, "lr": 4.674203164211357e-06, "epoch": 1.6486486486486487, "percentage": 16.49, "elapsed_time": "1:19:22", "remaining_time": "6:42:02"} +{"current_steps": 306, "total_steps": 1850, "loss": 0.6539, "lr": 4.67210444488131e-06, "epoch": 1.654054054054054, "percentage": 16.54, "elapsed_time": "1:19:25", "remaining_time": "6:40:44"} +{"current_steps": 307, "total_steps": 1850, "loss": 0.7214, "lr": 4.669999461765599e-06, "epoch": 1.6594594594594594, "percentage": 16.59, "elapsed_time": "1:19:26", "remaining_time": "6:39:18"} +{"current_steps": 308, "total_steps": 1850, "loss": 0.7451, "lr": 4.6678882209344474e-06, "epoch": 1.6648648648648647, "percentage": 16.65, "elapsed_time": "1:19:28", "remaining_time": "6:37:54"} +{"current_steps": 309, "total_steps": 1850, "loss": 0.6464, "lr": 4.665770728476127e-06, "epoch": 1.6702702702702703, "percentage": 16.7, "elapsed_time": "1:19:32", "remaining_time": "6:36:39"} +{"current_steps": 310, "total_steps": 1850, "loss": 0.6669, "lr": 4.663646990496939e-06, "epoch": 1.6756756756756757, "percentage": 16.76, "elapsed_time": "1:19:37", "remaining_time": "6:35:35"} +{"current_steps": 311, "total_steps": 1850, "loss": 0.8972, "lr": 4.661517013121189e-06, "epoch": 1.6810810810810812, "percentage": 16.81, "elapsed_time": "1:19:41", "remaining_time": "6:34:19"} +{"current_steps": 312, "total_steps": 1850, "loss": 0.6286, "lr": 4.659380802491181e-06, "epoch": 1.6864864864864866, "percentage": 16.86, "elapsed_time": "1:19:41", "remaining_time": "6:32:51"} +{"current_steps": 313, "total_steps": 1850, "loss": 0.3631, "lr": 4.6572383647671915e-06, "epoch": 1.691891891891892, "percentage": 16.92, "elapsed_time": "1:19:44", "remaining_time": "6:31:32"} +{"current_steps": 314, "total_steps": 1850, "loss": 0.5682, "lr": 4.655089706127457e-06, "epoch": 1.6972972972972973, "percentage": 16.97, "elapsed_time": "1:19:46", "remaining_time": "6:30:16"} +{"current_steps": 315, "total_steps": 1850, "loss": 0.5457, "lr": 4.652934832768148e-06, "epoch": 1.7027027027027026, "percentage": 17.03, "elapsed_time": "1:19:52", "remaining_time": "6:29:14"} +{"current_steps": 316, "total_steps": 1850, "loss": 0.6601, "lr": 4.650773750903363e-06, "epoch": 1.708108108108108, "percentage": 17.08, "elapsed_time": "1:19:56", "remaining_time": "6:28:04"} +{"current_steps": 317, "total_steps": 1850, "loss": 0.5882, "lr": 4.6486064667651005e-06, "epoch": 1.7135135135135136, "percentage": 17.14, "elapsed_time": "1:19:58", "remaining_time": "6:26:46"} +{"current_steps": 318, "total_steps": 1850, "loss": 0.7628, "lr": 4.646432986603245e-06, "epoch": 1.718918918918919, "percentage": 17.19, "elapsed_time": "1:20:03", "remaining_time": "6:25:39"} +{"current_steps": 319, "total_steps": 1850, "loss": 0.6877, "lr": 4.644253316685552e-06, "epoch": 1.7243243243243245, "percentage": 17.24, "elapsed_time": "1:20:05", "remaining_time": "6:24:22"} +{"current_steps": 320, "total_steps": 1850, "loss": 0.7026, "lr": 4.6420674632976205e-06, "epoch": 1.7297297297297298, "percentage": 17.3, "elapsed_time": "1:20:08", "remaining_time": "6:23:10"} +{"current_steps": 321, "total_steps": 1850, "loss": 0.5236, "lr": 4.639875432742886e-06, "epoch": 1.7351351351351352, "percentage": 17.35, "elapsed_time": "1:20:09", "remaining_time": "6:21:48"} +{"current_steps": 322, "total_steps": 1850, "loss": 0.6463, "lr": 4.6376772313425975e-06, "epoch": 1.7405405405405405, "percentage": 17.41, "elapsed_time": "1:20:10", "remaining_time": "6:20:27"} +{"current_steps": 323, "total_steps": 1850, "loss": 0.6903, "lr": 4.635472865435795e-06, "epoch": 1.7459459459459459, "percentage": 17.46, "elapsed_time": "1:20:13", "remaining_time": "6:19:16"} +{"current_steps": 324, "total_steps": 1850, "loss": 0.7342, "lr": 4.6332623413792995e-06, "epoch": 1.7513513513513512, "percentage": 17.51, "elapsed_time": "1:20:16", "remaining_time": "6:18:02"} +{"current_steps": 325, "total_steps": 1850, "loss": 0.4302, "lr": 4.6310456655476874e-06, "epoch": 1.7567567567567568, "percentage": 17.57, "elapsed_time": "1:20:18", "remaining_time": "6:16:48"} +{"current_steps": 326, "total_steps": 1850, "loss": 0.5108, "lr": 4.6288228443332786e-06, "epoch": 1.7621621621621621, "percentage": 17.62, "elapsed_time": "1:20:20", "remaining_time": "6:15:32"} +{"current_steps": 327, "total_steps": 1850, "loss": 0.7646, "lr": 4.626593884146111e-06, "epoch": 1.7675675675675677, "percentage": 17.68, "elapsed_time": "1:20:23", "remaining_time": "6:14:25"} +{"current_steps": 328, "total_steps": 1850, "loss": 0.5529, "lr": 4.624358791413928e-06, "epoch": 1.772972972972973, "percentage": 17.73, "elapsed_time": "1:20:25", "remaining_time": "6:13:12"} +{"current_steps": 329, "total_steps": 1850, "loss": 0.609, "lr": 4.622117572582159e-06, "epoch": 1.7783783783783784, "percentage": 17.78, "elapsed_time": "1:20:29", "remaining_time": "6:12:07"} +{"current_steps": 330, "total_steps": 1850, "loss": 0.9146, "lr": 4.619870234113894e-06, "epoch": 1.7837837837837838, "percentage": 17.84, "elapsed_time": "1:20:30", "remaining_time": "6:10:50"} +{"current_steps": 331, "total_steps": 1850, "loss": 0.6887, "lr": 4.617616782489878e-06, "epoch": 1.7891891891891891, "percentage": 17.89, "elapsed_time": "1:20:36", "remaining_time": "6:09:53"} +{"current_steps": 332, "total_steps": 1850, "loss": 0.505, "lr": 4.615357224208477e-06, "epoch": 1.7945945945945945, "percentage": 17.95, "elapsed_time": "1:20:40", "remaining_time": "6:08:53"} +{"current_steps": 333, "total_steps": 1850, "loss": 0.8384, "lr": 4.613091565785674e-06, "epoch": 1.8, "percentage": 18.0, "elapsed_time": "1:20:44", "remaining_time": "6:07:47"} +{"current_steps": 334, "total_steps": 1850, "loss": 0.5512, "lr": 4.610819813755038e-06, "epoch": 1.8054054054054054, "percentage": 18.05, "elapsed_time": "1:20:50", "remaining_time": "6:06:56"} +{"current_steps": 335, "total_steps": 1850, "loss": 0.4877, "lr": 4.608541974667714e-06, "epoch": 1.810810810810811, "percentage": 18.11, "elapsed_time": "1:20:54", "remaining_time": "6:05:54"} +{"current_steps": 336, "total_steps": 1850, "loss": 0.5583, "lr": 4.606258055092397e-06, "epoch": 1.8162162162162163, "percentage": 18.16, "elapsed_time": "1:20:58", "remaining_time": "6:04:50"} +{"current_steps": 337, "total_steps": 1850, "loss": 0.5421, "lr": 4.603968061615321e-06, "epoch": 1.8216216216216217, "percentage": 18.22, "elapsed_time": "1:21:00", "remaining_time": "6:03:43"} +{"current_steps": 338, "total_steps": 1850, "loss": 0.942, "lr": 4.601672000840231e-06, "epoch": 1.827027027027027, "percentage": 18.27, "elapsed_time": "1:21:04", "remaining_time": "6:02:38"} +{"current_steps": 339, "total_steps": 1850, "loss": 0.3773, "lr": 4.5993698793883715e-06, "epoch": 1.8324324324324324, "percentage": 18.32, "elapsed_time": "1:21:06", "remaining_time": "6:01:30"} +{"current_steps": 340, "total_steps": 1850, "loss": 0.9694, "lr": 4.597061703898462e-06, "epoch": 1.8378378378378377, "percentage": 18.38, "elapsed_time": "1:21:08", "remaining_time": "6:00:21"} +{"current_steps": 341, "total_steps": 1850, "loss": 0.4667, "lr": 4.594747481026685e-06, "epoch": 1.8432432432432433, "percentage": 18.43, "elapsed_time": "1:21:12", "remaining_time": "5:59:20"} +{"current_steps": 342, "total_steps": 1850, "loss": 0.4267, "lr": 4.592427217446656e-06, "epoch": 1.8486486486486486, "percentage": 18.49, "elapsed_time": "1:21:13", "remaining_time": "5:58:08"} +{"current_steps": 343, "total_steps": 1850, "loss": 0.9245, "lr": 4.590100919849413e-06, "epoch": 1.8540540540540542, "percentage": 18.54, "elapsed_time": "1:21:16", "remaining_time": "5:57:03"} +{"current_steps": 344, "total_steps": 1850, "loss": 0.7502, "lr": 4.587768594943396e-06, "epoch": 1.8594594594594596, "percentage": 18.59, "elapsed_time": "1:21:23", "remaining_time": "5:56:19"} +{"current_steps": 345, "total_steps": 1850, "loss": 0.4689, "lr": 4.585430249454426e-06, "epoch": 1.864864864864865, "percentage": 18.65, "elapsed_time": "1:21:25", "remaining_time": "5:55:10"} +{"current_steps": 346, "total_steps": 1850, "loss": 0.6188, "lr": 4.583085890125682e-06, "epoch": 1.8702702702702703, "percentage": 18.7, "elapsed_time": "1:21:29", "remaining_time": "5:54:14"} +{"current_steps": 347, "total_steps": 1850, "loss": 0.6352, "lr": 4.5807355237176896e-06, "epoch": 1.8756756756756756, "percentage": 18.76, "elapsed_time": "1:21:35", "remaining_time": "5:53:25"} +{"current_steps": 348, "total_steps": 1850, "loss": 0.464, "lr": 4.578379157008296e-06, "epoch": 1.881081081081081, "percentage": 18.81, "elapsed_time": "1:21:38", "remaining_time": "5:52:24"} +{"current_steps": 349, "total_steps": 1850, "loss": 0.5943, "lr": 4.57601679679265e-06, "epoch": 1.8864864864864865, "percentage": 18.86, "elapsed_time": "1:21:46", "remaining_time": "5:51:41"} +{"current_steps": 350, "total_steps": 1850, "loss": 0.6949, "lr": 4.573648449883188e-06, "epoch": 1.8918918918918919, "percentage": 18.92, "elapsed_time": "1:21:49", "remaining_time": "5:50:41"} +{"current_steps": 351, "total_steps": 1850, "loss": 0.4333, "lr": 4.571274123109606e-06, "epoch": 1.8972972972972975, "percentage": 18.97, "elapsed_time": "1:21:51", "remaining_time": "5:49:33"} +{"current_steps": 352, "total_steps": 1850, "loss": 0.6796, "lr": 4.568893823318847e-06, "epoch": 1.9027027027027028, "percentage": 19.03, "elapsed_time": "1:21:57", "remaining_time": "5:48:47"} +{"current_steps": 353, "total_steps": 1850, "loss": 0.6139, "lr": 4.566507557375077e-06, "epoch": 1.9081081081081082, "percentage": 19.08, "elapsed_time": "1:22:01", "remaining_time": "5:47:50"} +{"current_steps": 354, "total_steps": 1850, "loss": 0.4515, "lr": 4.5641153321596684e-06, "epoch": 1.9135135135135135, "percentage": 19.14, "elapsed_time": "1:22:03", "remaining_time": "5:46:48"} +{"current_steps": 355, "total_steps": 1850, "loss": 0.8426, "lr": 4.56171715457118e-06, "epoch": 1.9189189189189189, "percentage": 19.19, "elapsed_time": "1:22:06", "remaining_time": "5:45:44"} +{"current_steps": 356, "total_steps": 1850, "loss": 0.5806, "lr": 4.559313031525331e-06, "epoch": 1.9243243243243242, "percentage": 19.24, "elapsed_time": "1:22:07", "remaining_time": "5:44:38"} +{"current_steps": 357, "total_steps": 1850, "loss": 0.5927, "lr": 4.55690296995499e-06, "epoch": 1.9297297297297298, "percentage": 19.3, "elapsed_time": "1:22:09", "remaining_time": "5:43:33"} +{"current_steps": 358, "total_steps": 1850, "loss": 0.9986, "lr": 4.554486976810149e-06, "epoch": 1.9351351351351351, "percentage": 19.35, "elapsed_time": "1:22:10", "remaining_time": "5:42:28"} +{"current_steps": 359, "total_steps": 1850, "loss": 0.6813, "lr": 4.552065059057906e-06, "epoch": 1.9405405405405407, "percentage": 19.41, "elapsed_time": "1:22:12", "remaining_time": "5:41:26"} +{"current_steps": 360, "total_steps": 1850, "loss": 1.0832, "lr": 4.549637223682441e-06, "epoch": 1.945945945945946, "percentage": 19.46, "elapsed_time": "1:22:14", "remaining_time": "5:40:24"} +{"current_steps": 361, "total_steps": 1850, "loss": 0.7377, "lr": 4.547203477685005e-06, "epoch": 1.9513513513513514, "percentage": 19.51, "elapsed_time": "1:22:17", "remaining_time": "5:39:25"} +{"current_steps": 362, "total_steps": 1850, "loss": 0.5412, "lr": 4.544763828083888e-06, "epoch": 1.9567567567567568, "percentage": 19.57, "elapsed_time": "1:22:22", "remaining_time": "5:38:36"} +{"current_steps": 363, "total_steps": 1850, "loss": 0.6955, "lr": 4.542318281914405e-06, "epoch": 1.962162162162162, "percentage": 19.62, "elapsed_time": "1:22:26", "remaining_time": "5:37:41"} +{"current_steps": 364, "total_steps": 1850, "loss": 0.6774, "lr": 4.53986684622888e-06, "epoch": 1.9675675675675675, "percentage": 19.68, "elapsed_time": "1:22:30", "remaining_time": "5:36:49"} +{"current_steps": 365, "total_steps": 1850, "loss": 0.5832, "lr": 4.537409528096615e-06, "epoch": 1.972972972972973, "percentage": 19.73, "elapsed_time": "1:22:31", "remaining_time": "5:35:46"} +{"current_steps": 366, "total_steps": 1850, "loss": 0.606, "lr": 4.534946334603879e-06, "epoch": 1.9783783783783784, "percentage": 19.78, "elapsed_time": "1:22:35", "remaining_time": "5:34:51"} +{"current_steps": 367, "total_steps": 1850, "loss": 0.4991, "lr": 4.532477272853882e-06, "epoch": 1.983783783783784, "percentage": 19.84, "elapsed_time": "1:22:38", "remaining_time": "5:33:57"} +{"current_steps": 368, "total_steps": 1850, "loss": 0.4442, "lr": 4.530002349966759e-06, "epoch": 1.9891891891891893, "percentage": 19.89, "elapsed_time": "1:22:40", "remaining_time": "5:32:55"} +{"current_steps": 369, "total_steps": 1850, "loss": 0.6566, "lr": 4.5275215730795445e-06, "epoch": 1.9945945945945946, "percentage": 19.95, "elapsed_time": "1:22:46", "remaining_time": "5:32:12"} +{"current_steps": 370, "total_steps": 1850, "loss": 0.5687, "lr": 4.525034949346156e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:22:47", "remaining_time": "5:31:09"} +{"current_steps": 371, "total_steps": 1850, "loss": 0.4458, "lr": 4.522542485937369e-06, "epoch": 2.0054054054054054, "percentage": 20.05, "elapsed_time": "1:48:41", "remaining_time": "7:13:18"} +{"current_steps": 372, "total_steps": 1850, "loss": 0.4418, "lr": 4.5200441900408045e-06, "epoch": 2.0108108108108107, "percentage": 20.11, "elapsed_time": "1:48:44", "remaining_time": "7:12:03"} +{"current_steps": 373, "total_steps": 1850, "loss": 0.7057, "lr": 4.517540068860898e-06, "epoch": 2.016216216216216, "percentage": 20.16, "elapsed_time": "1:48:47", "remaining_time": "7:10:48"} +{"current_steps": 374, "total_steps": 1850, "loss": 0.4491, "lr": 4.515030129618884e-06, "epoch": 2.0216216216216214, "percentage": 20.22, "elapsed_time": "1:48:54", "remaining_time": "7:09:50"} +{"current_steps": 375, "total_steps": 1850, "loss": 0.3571, "lr": 4.512514379552779e-06, "epoch": 2.027027027027027, "percentage": 20.27, "elapsed_time": "1:48:58", "remaining_time": "7:08:38"} +{"current_steps": 376, "total_steps": 1850, "loss": 0.5056, "lr": 4.509992825917352e-06, "epoch": 2.0324324324324325, "percentage": 20.32, "elapsed_time": "1:49:04", "remaining_time": "7:07:36"} +{"current_steps": 377, "total_steps": 1850, "loss": 0.6834, "lr": 4.507465475984109e-06, "epoch": 2.037837837837838, "percentage": 20.38, "elapsed_time": "1:49:06", "remaining_time": "7:06:18"} +{"current_steps": 378, "total_steps": 1850, "loss": 0.6726, "lr": 4.504932337041272e-06, "epoch": 2.0432432432432432, "percentage": 20.43, "elapsed_time": "1:49:09", "remaining_time": "7:05:05"} +{"current_steps": 379, "total_steps": 1850, "loss": 0.4032, "lr": 4.502393416393757e-06, "epoch": 2.0486486486486486, "percentage": 20.49, "elapsed_time": "1:49:11", "remaining_time": "7:03:49"} +{"current_steps": 380, "total_steps": 1850, "loss": 0.5442, "lr": 4.4998487213631515e-06, "epoch": 2.054054054054054, "percentage": 20.54, "elapsed_time": "1:49:15", "remaining_time": "7:02:37"} +{"current_steps": 381, "total_steps": 1850, "loss": 0.6181, "lr": 4.497298259287696e-06, "epoch": 2.0594594594594593, "percentage": 20.59, "elapsed_time": "1:49:16", "remaining_time": "7:01:19"} +{"current_steps": 382, "total_steps": 1850, "loss": 0.3829, "lr": 4.494742037522261e-06, "epoch": 2.064864864864865, "percentage": 20.65, "elapsed_time": "1:49:20", "remaining_time": "7:00:11"} +{"current_steps": 383, "total_steps": 1850, "loss": 0.4953, "lr": 4.4921800634383295e-06, "epoch": 2.0702702702702704, "percentage": 20.7, "elapsed_time": "1:49:22", "remaining_time": "6:58:54"} +{"current_steps": 384, "total_steps": 1850, "loss": 0.3254, "lr": 4.4896123444239655e-06, "epoch": 2.075675675675676, "percentage": 20.76, "elapsed_time": "1:49:23", "remaining_time": "6:57:38"} +{"current_steps": 385, "total_steps": 1850, "loss": 0.555, "lr": 4.487038887883809e-06, "epoch": 2.081081081081081, "percentage": 20.81, "elapsed_time": "1:49:27", "remaining_time": "6:56:29"} +{"current_steps": 386, "total_steps": 1850, "loss": 0.665, "lr": 4.484459701239038e-06, "epoch": 2.0864864864864865, "percentage": 20.86, "elapsed_time": "1:49:31", "remaining_time": "6:55:22"} +{"current_steps": 387, "total_steps": 1850, "loss": 0.2652, "lr": 4.481874791927358e-06, "epoch": 2.091891891891892, "percentage": 20.92, "elapsed_time": "1:49:34", "remaining_time": "6:54:13"} +{"current_steps": 388, "total_steps": 1850, "loss": 0.3811, "lr": 4.479284167402977e-06, "epoch": 2.097297297297297, "percentage": 20.97, "elapsed_time": "1:49:37", "remaining_time": "6:53:05"} +{"current_steps": 389, "total_steps": 1850, "loss": 0.2463, "lr": 4.476687835136585e-06, "epoch": 2.1027027027027025, "percentage": 21.03, "elapsed_time": "1:49:40", "remaining_time": "6:51:54"} +{"current_steps": 390, "total_steps": 1850, "loss": 0.5507, "lr": 4.47408580261533e-06, "epoch": 2.108108108108108, "percentage": 21.08, "elapsed_time": "1:49:45", "remaining_time": "6:50:54"} +{"current_steps": 391, "total_steps": 1850, "loss": 0.288, "lr": 4.471478077342798e-06, "epoch": 2.1135135135135137, "percentage": 21.14, "elapsed_time": "1:49:48", "remaining_time": "6:49:46"} +{"current_steps": 392, "total_steps": 1850, "loss": 0.5169, "lr": 4.468864666838994e-06, "epoch": 2.118918918918919, "percentage": 21.19, "elapsed_time": "1:49:52", "remaining_time": "6:48:41"} +{"current_steps": 393, "total_steps": 1850, "loss": 0.3327, "lr": 4.4662455786403125e-06, "epoch": 2.1243243243243244, "percentage": 21.24, "elapsed_time": "1:49:55", "remaining_time": "6:47:30"} +{"current_steps": 394, "total_steps": 1850, "loss": 0.3877, "lr": 4.463620820299528e-06, "epoch": 2.1297297297297297, "percentage": 21.3, "elapsed_time": "1:50:00", "remaining_time": "6:46:32"} +{"current_steps": 395, "total_steps": 1850, "loss": 0.5425, "lr": 4.4609903993857606e-06, "epoch": 2.135135135135135, "percentage": 21.35, "elapsed_time": "1:50:02", "remaining_time": "6:45:22"} +{"current_steps": 396, "total_steps": 1850, "loss": 0.5257, "lr": 4.458354323484462e-06, "epoch": 2.1405405405405404, "percentage": 21.41, "elapsed_time": "1:50:05", "remaining_time": "6:44:13"} +{"current_steps": 397, "total_steps": 1850, "loss": 0.3914, "lr": 4.45571260019739e-06, "epoch": 2.145945945945946, "percentage": 21.46, "elapsed_time": "1:50:08", "remaining_time": "6:43:05"} +{"current_steps": 398, "total_steps": 1850, "loss": 0.3455, "lr": 4.453065237142592e-06, "epoch": 2.1513513513513516, "percentage": 21.51, "elapsed_time": "1:50:09", "remaining_time": "6:41:54"} +{"current_steps": 399, "total_steps": 1850, "loss": 0.4652, "lr": 4.4504122419543745e-06, "epoch": 2.156756756756757, "percentage": 21.57, "elapsed_time": "1:50:15", "remaining_time": "6:40:58"} +{"current_steps": 400, "total_steps": 1850, "loss": 0.6343, "lr": 4.4477536222832865e-06, "epoch": 2.1621621621621623, "percentage": 21.62, "elapsed_time": "1:50:19", "remaining_time": "6:39:54"} +{"current_steps": 401, "total_steps": 1850, "loss": 0.6975, "lr": 4.445089385796099e-06, "epoch": 2.1675675675675676, "percentage": 21.68, "elapsed_time": "1:50:21", "remaining_time": "6:38:45"} +{"current_steps": 402, "total_steps": 1850, "loss": 0.5779, "lr": 4.442419540175778e-06, "epoch": 2.172972972972973, "percentage": 21.73, "elapsed_time": "1:50:24", "remaining_time": "6:37:41"} +{"current_steps": 403, "total_steps": 1850, "loss": 0.4541, "lr": 4.439744093121465e-06, "epoch": 2.1783783783783783, "percentage": 21.78, "elapsed_time": "1:50:30", "remaining_time": "6:36:45"} +{"current_steps": 404, "total_steps": 1850, "loss": 0.4078, "lr": 4.437063052348457e-06, "epoch": 2.1837837837837837, "percentage": 21.84, "elapsed_time": "1:50:32", "remaining_time": "6:35:37"} +{"current_steps": 405, "total_steps": 1850, "loss": 0.6759, "lr": 4.434376425588179e-06, "epoch": 2.189189189189189, "percentage": 21.89, "elapsed_time": "1:50:34", "remaining_time": "6:34:30"} +{"current_steps": 406, "total_steps": 1850, "loss": 0.2938, "lr": 4.431684220588163e-06, "epoch": 2.1945945945945944, "percentage": 21.95, "elapsed_time": "1:50:38", "remaining_time": "6:33:32"} +{"current_steps": 407, "total_steps": 1850, "loss": 0.676, "lr": 4.428986445112034e-06, "epoch": 2.2, "percentage": 22.0, "elapsed_time": "1:50:41", "remaining_time": "6:32:26"} +{"current_steps": 408, "total_steps": 1850, "loss": 0.1859, "lr": 4.426283106939474e-06, "epoch": 2.2054054054054055, "percentage": 22.05, "elapsed_time": "1:50:44", "remaining_time": "6:31:22"} +{"current_steps": 409, "total_steps": 1850, "loss": 0.2955, "lr": 4.423574213866209e-06, "epoch": 2.210810810810811, "percentage": 22.11, "elapsed_time": "1:50:45", "remaining_time": "6:30:13"} +{"current_steps": 410, "total_steps": 1850, "loss": 0.2262, "lr": 4.420859773703985e-06, "epoch": 2.2162162162162162, "percentage": 22.16, "elapsed_time": "1:50:48", "remaining_time": "6:29:10"} +{"current_steps": 411, "total_steps": 1850, "loss": 0.2273, "lr": 4.418139794280542e-06, "epoch": 2.2216216216216216, "percentage": 22.22, "elapsed_time": "1:50:49", "remaining_time": "6:28:00"} +{"current_steps": 412, "total_steps": 1850, "loss": 0.3282, "lr": 4.415414283439595e-06, "epoch": 2.227027027027027, "percentage": 22.27, "elapsed_time": "1:50:54", "remaining_time": "6:27:04"} +{"current_steps": 413, "total_steps": 1850, "loss": 0.3651, "lr": 4.4126832490408116e-06, "epoch": 2.2324324324324323, "percentage": 22.32, "elapsed_time": "1:51:00", "remaining_time": "6:26:14"} +{"current_steps": 414, "total_steps": 1850, "loss": 0.4052, "lr": 4.409946698959784e-06, "epoch": 2.237837837837838, "percentage": 22.38, "elapsed_time": "1:51:05", "remaining_time": "6:25:19"} +{"current_steps": 415, "total_steps": 1850, "loss": 0.4638, "lr": 4.4072046410880145e-06, "epoch": 2.2432432432432434, "percentage": 22.43, "elapsed_time": "1:51:07", "remaining_time": "6:24:15"} +{"current_steps": 416, "total_steps": 1850, "loss": 0.517, "lr": 4.404457083332887e-06, "epoch": 2.2486486486486488, "percentage": 22.49, "elapsed_time": "1:51:10", "remaining_time": "6:23:14"} +{"current_steps": 417, "total_steps": 1850, "loss": 0.6902, "lr": 4.401704033617643e-06, "epoch": 2.254054054054054, "percentage": 22.54, "elapsed_time": "1:51:13", "remaining_time": "6:22:14"} +{"current_steps": 418, "total_steps": 1850, "loss": 0.3552, "lr": 4.398945499881366e-06, "epoch": 2.2594594594594595, "percentage": 22.59, "elapsed_time": "1:51:18", "remaining_time": "6:21:17"} +{"current_steps": 419, "total_steps": 1850, "loss": 0.286, "lr": 4.396181490078949e-06, "epoch": 2.264864864864865, "percentage": 22.65, "elapsed_time": "1:51:22", "remaining_time": "6:20:21"} +{"current_steps": 420, "total_steps": 1850, "loss": 0.4036, "lr": 4.393412012181082e-06, "epoch": 2.27027027027027, "percentage": 22.7, "elapsed_time": "1:51:23", "remaining_time": "6:19:15"} +{"current_steps": 421, "total_steps": 1850, "loss": 0.8037, "lr": 4.390637074174219e-06, "epoch": 2.2756756756756755, "percentage": 22.76, "elapsed_time": "1:51:27", "remaining_time": "6:18:18"} +{"current_steps": 422, "total_steps": 1850, "loss": 0.2553, "lr": 4.387856684060561e-06, "epoch": 2.281081081081081, "percentage": 22.81, "elapsed_time": "1:51:29", "remaining_time": "6:17:16"} +{"current_steps": 423, "total_steps": 1850, "loss": 0.6222, "lr": 4.385070849858033e-06, "epoch": 2.2864864864864867, "percentage": 22.86, "elapsed_time": "1:51:32", "remaining_time": "6:16:17"} +{"current_steps": 424, "total_steps": 1850, "loss": 0.5326, "lr": 4.382279579600257e-06, "epoch": 2.291891891891892, "percentage": 22.92, "elapsed_time": "1:51:35", "remaining_time": "6:15:17"} +{"current_steps": 425, "total_steps": 1850, "loss": 0.5515, "lr": 4.379482881336532e-06, "epoch": 2.2972972972972974, "percentage": 22.97, "elapsed_time": "1:51:42", "remaining_time": "6:14:31"} +{"current_steps": 426, "total_steps": 1850, "loss": 0.6948, "lr": 4.376680763131811e-06, "epoch": 2.3027027027027027, "percentage": 23.03, "elapsed_time": "1:51:45", "remaining_time": "6:13:33"} +{"current_steps": 427, "total_steps": 1850, "loss": 0.2947, "lr": 4.373873233066676e-06, "epoch": 2.308108108108108, "percentage": 23.08, "elapsed_time": "1:51:49", "remaining_time": "6:12:39"} +{"current_steps": 428, "total_steps": 1850, "loss": 0.2261, "lr": 4.371060299237315e-06, "epoch": 2.3135135135135134, "percentage": 23.14, "elapsed_time": "1:51:50", "remaining_time": "6:11:35"} +{"current_steps": 429, "total_steps": 1850, "loss": 0.5398, "lr": 4.368241969755499e-06, "epoch": 2.3189189189189188, "percentage": 23.19, "elapsed_time": "1:51:53", "remaining_time": "6:10:38"} +{"current_steps": 430, "total_steps": 1850, "loss": 0.3301, "lr": 4.36541825274856e-06, "epoch": 2.3243243243243246, "percentage": 23.24, "elapsed_time": "1:51:56", "remaining_time": "6:09:39"} +{"current_steps": 431, "total_steps": 1850, "loss": 0.6064, "lr": 4.3625891563593635e-06, "epoch": 2.32972972972973, "percentage": 23.3, "elapsed_time": "1:51:59", "remaining_time": "6:08:43"} +{"current_steps": 432, "total_steps": 1850, "loss": 0.3897, "lr": 4.35975468874629e-06, "epoch": 2.3351351351351353, "percentage": 23.35, "elapsed_time": "1:52:05", "remaining_time": "6:07:54"} +{"current_steps": 433, "total_steps": 1850, "loss": 0.271, "lr": 4.356914858083211e-06, "epoch": 2.3405405405405406, "percentage": 23.41, "elapsed_time": "1:52:09", "remaining_time": "6:07:03"} +{"current_steps": 434, "total_steps": 1850, "loss": 0.3681, "lr": 4.354069672559458e-06, "epoch": 2.345945945945946, "percentage": 23.46, "elapsed_time": "1:52:12", "remaining_time": "6:06:06"} +{"current_steps": 435, "total_steps": 1850, "loss": 0.298, "lr": 4.35121914037981e-06, "epoch": 2.3513513513513513, "percentage": 23.51, "elapsed_time": "1:52:16", "remaining_time": "6:05:11"} +{"current_steps": 436, "total_steps": 1850, "loss": 0.3618, "lr": 4.348363269764462e-06, "epoch": 2.3567567567567567, "percentage": 23.57, "elapsed_time": "1:52:20", "remaining_time": "6:04:19"} +{"current_steps": 437, "total_steps": 1850, "loss": 0.8972, "lr": 4.345502068949003e-06, "epoch": 2.362162162162162, "percentage": 23.62, "elapsed_time": "1:52:23", "remaining_time": "6:03:23"} +{"current_steps": 438, "total_steps": 1850, "loss": 0.3939, "lr": 4.342635546184394e-06, "epoch": 2.3675675675675674, "percentage": 23.68, "elapsed_time": "1:52:29", "remaining_time": "6:02:38"} +{"current_steps": 439, "total_steps": 1850, "loss": 0.5462, "lr": 4.339763709736944e-06, "epoch": 2.372972972972973, "percentage": 23.73, "elapsed_time": "1:52:32", "remaining_time": "6:01:43"} +{"current_steps": 440, "total_steps": 1850, "loss": 0.5932, "lr": 4.336886567888283e-06, "epoch": 2.3783783783783785, "percentage": 23.78, "elapsed_time": "1:52:35", "remaining_time": "6:00:48"} +{"current_steps": 441, "total_steps": 1850, "loss": 0.4622, "lr": 4.334004128935342e-06, "epoch": 2.383783783783784, "percentage": 23.84, "elapsed_time": "1:52:37", "remaining_time": "5:59:50"} +{"current_steps": 442, "total_steps": 1850, "loss": 0.5997, "lr": 4.331116401190327e-06, "epoch": 2.389189189189189, "percentage": 23.89, "elapsed_time": "1:52:40", "remaining_time": "5:58:56"} +{"current_steps": 443, "total_steps": 1850, "loss": 0.3072, "lr": 4.328223392980696e-06, "epoch": 2.3945945945945946, "percentage": 23.95, "elapsed_time": "1:52:46", "remaining_time": "5:58:11"} +{"current_steps": 444, "total_steps": 1850, "loss": 0.5338, "lr": 4.325325112649134e-06, "epoch": 2.4, "percentage": 24.0, "elapsed_time": "1:52:48", "remaining_time": "5:57:14"} +{"current_steps": 445, "total_steps": 1850, "loss": 0.3266, "lr": 4.322421568553529e-06, "epoch": 2.4054054054054053, "percentage": 24.05, "elapsed_time": "1:52:51", "remaining_time": "5:56:20"} +{"current_steps": 446, "total_steps": 1850, "loss": 0.4064, "lr": 4.3195127690669494e-06, "epoch": 2.410810810810811, "percentage": 24.11, "elapsed_time": "1:52:55", "remaining_time": "5:55:27"} +{"current_steps": 447, "total_steps": 1850, "loss": 0.3856, "lr": 4.3165987225776186e-06, "epoch": 2.4162162162162164, "percentage": 24.16, "elapsed_time": "1:52:57", "remaining_time": "5:54:32"} +{"current_steps": 448, "total_steps": 1850, "loss": 0.4261, "lr": 4.313679437488889e-06, "epoch": 2.4216216216216218, "percentage": 24.22, "elapsed_time": "1:53:00", "remaining_time": "5:53:38"} +{"current_steps": 449, "total_steps": 1850, "loss": 0.4943, "lr": 4.310754922219223e-06, "epoch": 2.427027027027027, "percentage": 24.27, "elapsed_time": "1:53:03", "remaining_time": "5:52:45"} +{"current_steps": 450, "total_steps": 1850, "loss": 0.2874, "lr": 4.307825185202164e-06, "epoch": 2.4324324324324325, "percentage": 24.32, "elapsed_time": "1:53:07", "remaining_time": "5:51:55"} +{"current_steps": 451, "total_steps": 1850, "loss": 0.4218, "lr": 4.3048902348863116e-06, "epoch": 2.437837837837838, "percentage": 24.38, "elapsed_time": "1:53:10", "remaining_time": "5:51:05"} +{"current_steps": 452, "total_steps": 1850, "loss": 0.4204, "lr": 4.301950079735303e-06, "epoch": 2.443243243243243, "percentage": 24.43, "elapsed_time": "1:53:13", "remaining_time": "5:50:10"} +{"current_steps": 453, "total_steps": 1850, "loss": 0.5593, "lr": 4.299004728227782e-06, "epoch": 2.4486486486486485, "percentage": 24.49, "elapsed_time": "1:53:19", "remaining_time": "5:49:29"} +{"current_steps": 454, "total_steps": 1850, "loss": 0.4187, "lr": 4.2960541888573774e-06, "epoch": 2.454054054054054, "percentage": 24.54, "elapsed_time": "1:53:21", "remaining_time": "5:48:35"} +{"current_steps": 455, "total_steps": 1850, "loss": 0.4193, "lr": 4.29309847013268e-06, "epoch": 2.4594594594594597, "percentage": 24.59, "elapsed_time": "1:53:23", "remaining_time": "5:47:38"} +{"current_steps": 456, "total_steps": 1850, "loss": 0.7035, "lr": 4.290137580577216e-06, "epoch": 2.464864864864865, "percentage": 24.65, "elapsed_time": "1:53:24", "remaining_time": "5:46:41"} +{"current_steps": 457, "total_steps": 1850, "loss": 0.5877, "lr": 4.287171528729423e-06, "epoch": 2.4702702702702704, "percentage": 24.7, "elapsed_time": "1:53:28", "remaining_time": "5:45:53"} +{"current_steps": 458, "total_steps": 1850, "loss": 0.5309, "lr": 4.284200323142623e-06, "epoch": 2.4756756756756757, "percentage": 24.76, "elapsed_time": "1:53:30", "remaining_time": "5:45:00"} +{"current_steps": 459, "total_steps": 1850, "loss": 0.448, "lr": 4.281223972385004e-06, "epoch": 2.481081081081081, "percentage": 24.81, "elapsed_time": "1:53:32", "remaining_time": "5:44:04"} +{"current_steps": 460, "total_steps": 1850, "loss": 0.4453, "lr": 4.27824248503959e-06, "epoch": 2.4864864864864864, "percentage": 24.86, "elapsed_time": "1:53:36", "remaining_time": "5:43:18"} +{"current_steps": 461, "total_steps": 1850, "loss": 0.5582, "lr": 4.275255869704214e-06, "epoch": 2.4918918918918918, "percentage": 24.92, "elapsed_time": "1:53:42", "remaining_time": "5:42:36"} +{"current_steps": 462, "total_steps": 1850, "loss": 0.423, "lr": 4.272264134991503e-06, "epoch": 2.4972972972972975, "percentage": 24.97, "elapsed_time": "1:53:46", "remaining_time": "5:41:48"} +{"current_steps": 463, "total_steps": 1850, "loss": 0.271, "lr": 4.269267289528843e-06, "epoch": 2.5027027027027025, "percentage": 25.03, "elapsed_time": "1:53:50", "remaining_time": "5:41:00"} +{"current_steps": 464, "total_steps": 1850, "loss": 0.6459, "lr": 4.266265341958356e-06, "epoch": 2.5081081081081082, "percentage": 25.08, "elapsed_time": "1:53:51", "remaining_time": "5:40:06"} +{"current_steps": 465, "total_steps": 1850, "loss": 0.2959, "lr": 4.263258300936882e-06, "epoch": 2.5135135135135136, "percentage": 25.14, "elapsed_time": "1:53:52", "remaining_time": "5:39:11"} +{"current_steps": 466, "total_steps": 1850, "loss": 0.3418, "lr": 4.260246175135948e-06, "epoch": 2.518918918918919, "percentage": 25.19, "elapsed_time": "1:54:00", "remaining_time": "5:38:34"} +{"current_steps": 467, "total_steps": 1850, "loss": 0.3459, "lr": 4.257228973241742e-06, "epoch": 2.5243243243243243, "percentage": 25.24, "elapsed_time": "1:54:03", "remaining_time": "5:37:47"} +{"current_steps": 468, "total_steps": 1850, "loss": 0.4769, "lr": 4.254206703955092e-06, "epoch": 2.5297297297297296, "percentage": 25.3, "elapsed_time": "1:54:04", "remaining_time": "5:36:52"} +{"current_steps": 469, "total_steps": 1850, "loss": 0.6487, "lr": 4.251179375991438e-06, "epoch": 2.535135135135135, "percentage": 25.35, "elapsed_time": "1:54:08", "remaining_time": "5:36:05"} +{"current_steps": 470, "total_steps": 1850, "loss": 0.5547, "lr": 4.248146998080808e-06, "epoch": 2.5405405405405403, "percentage": 25.41, "elapsed_time": "1:54:11", "remaining_time": "5:35:17"} +{"current_steps": 471, "total_steps": 1850, "loss": 0.2965, "lr": 4.2451095789677945e-06, "epoch": 2.545945945945946, "percentage": 25.46, "elapsed_time": "1:54:15", "remaining_time": "5:34:30"} +{"current_steps": 472, "total_steps": 1850, "loss": 0.3831, "lr": 4.242067127411525e-06, "epoch": 2.5513513513513515, "percentage": 25.51, "elapsed_time": "1:54:17", "remaining_time": "5:33:40"} +{"current_steps": 473, "total_steps": 1850, "loss": 0.1756, "lr": 4.239019652185642e-06, "epoch": 2.556756756756757, "percentage": 25.57, "elapsed_time": "1:54:19", "remaining_time": "5:32:48"} +{"current_steps": 474, "total_steps": 1850, "loss": 0.5136, "lr": 4.2359671620782725e-06, "epoch": 2.562162162162162, "percentage": 25.62, "elapsed_time": "1:54:23", "remaining_time": "5:32:04"} +{"current_steps": 475, "total_steps": 1850, "loss": 0.6554, "lr": 4.232909665892005e-06, "epoch": 2.5675675675675675, "percentage": 25.68, "elapsed_time": "1:54:26", "remaining_time": "5:31:17"} +{"current_steps": 476, "total_steps": 1850, "loss": 0.3804, "lr": 4.229847172443866e-06, "epoch": 2.572972972972973, "percentage": 25.73, "elapsed_time": "1:54:28", "remaining_time": "5:30:25"} +{"current_steps": 477, "total_steps": 1850, "loss": 0.3338, "lr": 4.2267796905652926e-06, "epoch": 2.5783783783783782, "percentage": 25.78, "elapsed_time": "1:54:30", "remaining_time": "5:29:36"} +{"current_steps": 478, "total_steps": 1850, "loss": 0.6163, "lr": 4.223707229102105e-06, "epoch": 2.583783783783784, "percentage": 25.84, "elapsed_time": "1:54:32", "remaining_time": "5:28:46"} +{"current_steps": 479, "total_steps": 1850, "loss": 0.3005, "lr": 4.220629796914487e-06, "epoch": 2.589189189189189, "percentage": 25.89, "elapsed_time": "1:54:36", "remaining_time": "5:28:01"} +{"current_steps": 480, "total_steps": 1850, "loss": 0.56, "lr": 4.217547402876954e-06, "epoch": 2.5945945945945947, "percentage": 25.95, "elapsed_time": "1:54:37", "remaining_time": "5:27:10"} +{"current_steps": 481, "total_steps": 1850, "loss": 0.4512, "lr": 4.214460055878329e-06, "epoch": 2.6, "percentage": 26.0, "elapsed_time": "1:54:40", "remaining_time": "5:26:21"} +{"current_steps": 482, "total_steps": 1850, "loss": 0.3074, "lr": 4.211367764821722e-06, "epoch": 2.6054054054054054, "percentage": 26.05, "elapsed_time": "1:54:43", "remaining_time": "5:25:37"} +{"current_steps": 483, "total_steps": 1850, "loss": 0.6752, "lr": 4.208270538624497e-06, "epoch": 2.610810810810811, "percentage": 26.11, "elapsed_time": "1:54:48", "remaining_time": "5:24:55"} +{"current_steps": 484, "total_steps": 1850, "loss": 0.2347, "lr": 4.205168386218251e-06, "epoch": 2.616216216216216, "percentage": 26.16, "elapsed_time": "1:54:51", "remaining_time": "5:24:11"} +{"current_steps": 485, "total_steps": 1850, "loss": 0.5189, "lr": 4.2020613165487865e-06, "epoch": 2.6216216216216215, "percentage": 26.22, "elapsed_time": "1:54:53", "remaining_time": "5:23:22"} +{"current_steps": 486, "total_steps": 1850, "loss": 0.7739, "lr": 4.198949338576086e-06, "epoch": 2.627027027027027, "percentage": 26.27, "elapsed_time": "1:54:58", "remaining_time": "5:22:41"} +{"current_steps": 487, "total_steps": 1850, "loss": 0.3495, "lr": 4.1958324612742875e-06, "epoch": 2.6324324324324326, "percentage": 26.32, "elapsed_time": "1:55:03", "remaining_time": "5:22:00"} +{"current_steps": 488, "total_steps": 1850, "loss": 0.2257, "lr": 4.1927106936316564e-06, "epoch": 2.637837837837838, "percentage": 26.38, "elapsed_time": "1:55:04", "remaining_time": "5:21:11"} +{"current_steps": 489, "total_steps": 1850, "loss": 0.6708, "lr": 4.189584044650559e-06, "epoch": 2.6432432432432433, "percentage": 26.43, "elapsed_time": "1:55:07", "remaining_time": "5:20:24"} +{"current_steps": 490, "total_steps": 1850, "loss": 0.3126, "lr": 4.186452523347441e-06, "epoch": 2.6486486486486487, "percentage": 26.49, "elapsed_time": "1:55:08", "remaining_time": "5:19:35"} +{"current_steps": 491, "total_steps": 1850, "loss": 0.4219, "lr": 4.183316138752799e-06, "epoch": 2.654054054054054, "percentage": 26.54, "elapsed_time": "1:55:11", "remaining_time": "5:18:50"} +{"current_steps": 492, "total_steps": 1850, "loss": 0.3937, "lr": 4.180174899911149e-06, "epoch": 2.6594594594594594, "percentage": 26.59, "elapsed_time": "1:55:13", "remaining_time": "5:18:02"} +{"current_steps": 493, "total_steps": 1850, "loss": 0.4098, "lr": 4.177028815881012e-06, "epoch": 2.6648648648648647, "percentage": 26.65, "elapsed_time": "1:55:16", "remaining_time": "5:17:18"} +{"current_steps": 494, "total_steps": 1850, "loss": 0.3597, "lr": 4.173877895734875e-06, "epoch": 2.6702702702702705, "percentage": 26.7, "elapsed_time": "1:55:21", "remaining_time": "5:16:38"} +{"current_steps": 495, "total_steps": 1850, "loss": 0.3284, "lr": 4.1707221485591764e-06, "epoch": 2.6756756756756754, "percentage": 26.76, "elapsed_time": "1:55:25", "remaining_time": "5:15:56"} +{"current_steps": 496, "total_steps": 1850, "loss": 0.257, "lr": 4.167561583454272e-06, "epoch": 2.6810810810810812, "percentage": 26.81, "elapsed_time": "1:55:27", "remaining_time": "5:15:10"} +{"current_steps": 497, "total_steps": 1850, "loss": 0.1819, "lr": 4.164396209534411e-06, "epoch": 2.6864864864864866, "percentage": 26.86, "elapsed_time": "1:55:28", "remaining_time": "5:14:21"} +{"current_steps": 498, "total_steps": 1850, "loss": 0.7109, "lr": 4.161226035927711e-06, "epoch": 2.691891891891892, "percentage": 26.92, "elapsed_time": "1:55:33", "remaining_time": "5:13:43"} +{"current_steps": 499, "total_steps": 1850, "loss": 0.6297, "lr": 4.15805107177613e-06, "epoch": 2.6972972972972973, "percentage": 26.97, "elapsed_time": "1:55:36", "remaining_time": "5:13:01"} +{"current_steps": 500, "total_steps": 1850, "loss": 0.5195, "lr": 4.15487132623544e-06, "epoch": 2.7027027027027026, "percentage": 27.03, "elapsed_time": "1:55:39", "remaining_time": "5:12:17"} +{"current_steps": 501, "total_steps": 1850, "loss": 0.2528, "lr": 4.151686808475204e-06, "epoch": 2.708108108108108, "percentage": 27.08, "elapsed_time": "1:55:41", "remaining_time": "5:11:30"} +{"current_steps": 502, "total_steps": 1850, "loss": 0.5013, "lr": 4.148497527678744e-06, "epoch": 2.7135135135135133, "percentage": 27.14, "elapsed_time": "1:55:43", "remaining_time": "5:10:44"} +{"current_steps": 503, "total_steps": 1850, "loss": 0.4109, "lr": 4.145303493043118e-06, "epoch": 2.718918918918919, "percentage": 27.19, "elapsed_time": "1:55:47", "remaining_time": "5:10:05"} +{"current_steps": 504, "total_steps": 1850, "loss": 0.3197, "lr": 4.1421047137790935e-06, "epoch": 2.7243243243243245, "percentage": 27.24, "elapsed_time": "1:55:51", "remaining_time": "5:09:25"} +{"current_steps": 505, "total_steps": 1850, "loss": 0.6369, "lr": 4.13890119911112e-06, "epoch": 2.72972972972973, "percentage": 27.3, "elapsed_time": "1:55:54", "remaining_time": "5:08:43"} +{"current_steps": 506, "total_steps": 1850, "loss": 0.4581, "lr": 4.135692958277303e-06, "epoch": 2.735135135135135, "percentage": 27.35, "elapsed_time": "1:55:57", "remaining_time": "5:08:00"} +{"current_steps": 507, "total_steps": 1850, "loss": 0.6217, "lr": 4.132480000529375e-06, "epoch": 2.7405405405405405, "percentage": 27.41, "elapsed_time": "1:56:00", "remaining_time": "5:07:17"} +{"current_steps": 508, "total_steps": 1850, "loss": 0.4951, "lr": 4.129262335132676e-06, "epoch": 2.745945945945946, "percentage": 27.46, "elapsed_time": "1:56:07", "remaining_time": "5:06:44"} +{"current_steps": 509, "total_steps": 1850, "loss": 0.2185, "lr": 4.126039971366114e-06, "epoch": 2.7513513513513512, "percentage": 27.51, "elapsed_time": "1:56:09", "remaining_time": "5:06:02"} +{"current_steps": 510, "total_steps": 1850, "loss": 0.5428, "lr": 4.122812918522154e-06, "epoch": 2.756756756756757, "percentage": 27.57, "elapsed_time": "1:56:12", "remaining_time": "5:05:18"} +{"current_steps": 511, "total_steps": 1850, "loss": 0.5466, "lr": 4.119581185906776e-06, "epoch": 2.762162162162162, "percentage": 27.62, "elapsed_time": "1:56:13", "remaining_time": "5:04:32"} +{"current_steps": 512, "total_steps": 1850, "loss": 0.3803, "lr": 4.1163447828394595e-06, "epoch": 2.7675675675675677, "percentage": 27.68, "elapsed_time": "1:56:15", "remaining_time": "5:03:48"} +{"current_steps": 513, "total_steps": 1850, "loss": 0.2722, "lr": 4.113103718653152e-06, "epoch": 2.772972972972973, "percentage": 27.73, "elapsed_time": "1:56:18", "remaining_time": "5:03:07"} +{"current_steps": 514, "total_steps": 1850, "loss": 0.333, "lr": 4.10985800269424e-06, "epoch": 2.7783783783783784, "percentage": 27.78, "elapsed_time": "1:56:21", "remaining_time": "5:02:26"} +{"current_steps": 515, "total_steps": 1850, "loss": 0.2186, "lr": 4.106607644322529e-06, "epoch": 2.7837837837837838, "percentage": 27.84, "elapsed_time": "1:56:23", "remaining_time": "5:01:42"} +{"current_steps": 516, "total_steps": 1850, "loss": 0.6365, "lr": 4.103352652911207e-06, "epoch": 2.789189189189189, "percentage": 27.89, "elapsed_time": "1:56:25", "remaining_time": "5:00:58"} +{"current_steps": 517, "total_steps": 1850, "loss": 0.7261, "lr": 4.100093037846825e-06, "epoch": 2.7945945945945945, "percentage": 27.95, "elapsed_time": "1:56:29", "remaining_time": "5:00:22"} +{"current_steps": 518, "total_steps": 1850, "loss": 0.2767, "lr": 4.0968288085292675e-06, "epoch": 2.8, "percentage": 28.0, "elapsed_time": "1:56:32", "remaining_time": "4:59:40"} +{"current_steps": 519, "total_steps": 1850, "loss": 0.4743, "lr": 4.093559974371725e-06, "epoch": 2.8054054054054056, "percentage": 28.05, "elapsed_time": "1:56:35", "remaining_time": "4:59:00"} +{"current_steps": 520, "total_steps": 1850, "loss": 0.3789, "lr": 4.090286544800667e-06, "epoch": 2.810810810810811, "percentage": 28.11, "elapsed_time": "1:56:43", "remaining_time": "4:58:31"} +{"current_steps": 521, "total_steps": 1850, "loss": 0.6252, "lr": 4.087008529255815e-06, "epoch": 2.8162162162162163, "percentage": 28.16, "elapsed_time": "1:56:49", "remaining_time": "4:57:59"} +{"current_steps": 522, "total_steps": 1850, "loss": 0.3467, "lr": 4.083725937190115e-06, "epoch": 2.8216216216216217, "percentage": 28.22, "elapsed_time": "1:56:51", "remaining_time": "4:57:16"} +{"current_steps": 523, "total_steps": 1850, "loss": 0.3857, "lr": 4.0804387780697114e-06, "epoch": 2.827027027027027, "percentage": 28.27, "elapsed_time": "1:56:54", "remaining_time": "4:56:38"} +{"current_steps": 524, "total_steps": 1850, "loss": 0.4679, "lr": 4.077147061373918e-06, "epoch": 2.8324324324324324, "percentage": 28.32, "elapsed_time": "1:57:01", "remaining_time": "4:56:07"} +{"current_steps": 525, "total_steps": 1850, "loss": 0.2439, "lr": 4.073850796595192e-06, "epoch": 2.8378378378378377, "percentage": 28.38, "elapsed_time": "1:57:02", "remaining_time": "4:55:22"} +{"current_steps": 526, "total_steps": 1850, "loss": 0.435, "lr": 4.070549993239106e-06, "epoch": 2.8432432432432435, "percentage": 28.43, "elapsed_time": "1:57:04", "remaining_time": "4:54:41"} +{"current_steps": 527, "total_steps": 1850, "loss": 0.5022, "lr": 4.06724466082432e-06, "epoch": 2.8486486486486484, "percentage": 28.49, "elapsed_time": "1:57:09", "remaining_time": "4:54:07"} +{"current_steps": 528, "total_steps": 1850, "loss": 0.4282, "lr": 4.063934808882555e-06, "epoch": 2.854054054054054, "percentage": 28.54, "elapsed_time": "1:57:11", "remaining_time": "4:53:26"} +{"current_steps": 529, "total_steps": 1850, "loss": 0.3436, "lr": 4.0606204469585656e-06, "epoch": 2.8594594594594596, "percentage": 28.59, "elapsed_time": "1:57:13", "remaining_time": "4:52:43"} +{"current_steps": 530, "total_steps": 1850, "loss": 0.3889, "lr": 4.057301584610112e-06, "epoch": 2.864864864864865, "percentage": 28.65, "elapsed_time": "1:57:20", "remaining_time": "4:52:15"} +{"current_steps": 531, "total_steps": 1850, "loss": 0.4828, "lr": 4.053978231407931e-06, "epoch": 2.8702702702702703, "percentage": 28.7, "elapsed_time": "1:57:24", "remaining_time": "4:51:37"} +{"current_steps": 532, "total_steps": 1850, "loss": 0.5814, "lr": 4.0506503969357115e-06, "epoch": 2.8756756756756756, "percentage": 28.76, "elapsed_time": "1:57:29", "remaining_time": "4:51:04"} +{"current_steps": 533, "total_steps": 1850, "loss": 0.4768, "lr": 4.047318090790065e-06, "epoch": 2.881081081081081, "percentage": 28.81, "elapsed_time": "1:57:36", "remaining_time": "4:50:35"} +{"current_steps": 534, "total_steps": 1850, "loss": 0.4262, "lr": 4.043981322580498e-06, "epoch": 2.8864864864864863, "percentage": 28.86, "elapsed_time": "1:57:39", "remaining_time": "4:49:57"} +{"current_steps": 535, "total_steps": 1850, "loss": 0.421, "lr": 4.040640101929384e-06, "epoch": 2.891891891891892, "percentage": 28.92, "elapsed_time": "1:57:42", "remaining_time": "4:49:19"} +{"current_steps": 536, "total_steps": 1850, "loss": 0.4019, "lr": 4.037294438471936e-06, "epoch": 2.8972972972972975, "percentage": 28.97, "elapsed_time": "1:57:45", "remaining_time": "4:48:41"} +{"current_steps": 537, "total_steps": 1850, "loss": 0.4322, "lr": 4.033944341856181e-06, "epoch": 2.902702702702703, "percentage": 29.03, "elapsed_time": "1:57:47", "remaining_time": "4:48:01"} +{"current_steps": 538, "total_steps": 1850, "loss": 0.3841, "lr": 4.030589821742926e-06, "epoch": 2.908108108108108, "percentage": 29.08, "elapsed_time": "1:57:52", "remaining_time": "4:47:26"} +{"current_steps": 539, "total_steps": 1850, "loss": 0.7083, "lr": 4.0272308878057385e-06, "epoch": 2.9135135135135135, "percentage": 29.14, "elapsed_time": "1:57:55", "remaining_time": "4:46:49"} +{"current_steps": 540, "total_steps": 1850, "loss": 0.5688, "lr": 4.023867549730912e-06, "epoch": 2.918918918918919, "percentage": 29.19, "elapsed_time": "1:57:57", "remaining_time": "4:46:08"} +{"current_steps": 541, "total_steps": 1850, "loss": 0.5979, "lr": 4.020499817217441e-06, "epoch": 2.924324324324324, "percentage": 29.24, "elapsed_time": "1:57:58", "remaining_time": "4:45:28"} +{"current_steps": 542, "total_steps": 1850, "loss": 0.5034, "lr": 4.017127699976992e-06, "epoch": 2.92972972972973, "percentage": 29.3, "elapsed_time": "1:58:01", "remaining_time": "4:44:48"} +{"current_steps": 543, "total_steps": 1850, "loss": 0.6656, "lr": 4.013751207733877e-06, "epoch": 2.935135135135135, "percentage": 29.35, "elapsed_time": "1:58:05", "remaining_time": "4:44:15"} +{"current_steps": 544, "total_steps": 1850, "loss": 0.2789, "lr": 4.010370350225023e-06, "epoch": 2.9405405405405407, "percentage": 29.41, "elapsed_time": "1:58:08", "remaining_time": "4:43:38"} +{"current_steps": 545, "total_steps": 1850, "loss": 0.2163, "lr": 4.006985137199945e-06, "epoch": 2.945945945945946, "percentage": 29.46, "elapsed_time": "1:58:10", "remaining_time": "4:42:57"} +{"current_steps": 546, "total_steps": 1850, "loss": 0.4179, "lr": 4.00359557842072e-06, "epoch": 2.9513513513513514, "percentage": 29.51, "elapsed_time": "1:58:12", "remaining_time": "4:42:18"} +{"current_steps": 547, "total_steps": 1850, "loss": 0.4683, "lr": 4.000201683661958e-06, "epoch": 2.9567567567567568, "percentage": 29.57, "elapsed_time": "1:58:15", "remaining_time": "4:41:42"} +{"current_steps": 548, "total_steps": 1850, "loss": 0.3506, "lr": 3.996803462710766e-06, "epoch": 2.962162162162162, "percentage": 29.62, "elapsed_time": "1:58:19", "remaining_time": "4:41:08"} +{"current_steps": 549, "total_steps": 1850, "loss": 0.6582, "lr": 3.993400925366736e-06, "epoch": 2.9675675675675675, "percentage": 29.68, "elapsed_time": "1:58:21", "remaining_time": "4:40:29"} +{"current_steps": 550, "total_steps": 1850, "loss": 0.504, "lr": 3.989994081441902e-06, "epoch": 2.972972972972973, "percentage": 29.73, "elapsed_time": "1:58:28", "remaining_time": "4:40:01"} +{"current_steps": 551, "total_steps": 1850, "loss": 0.7362, "lr": 3.986582940760717e-06, "epoch": 2.9783783783783786, "percentage": 29.78, "elapsed_time": "1:58:31", "remaining_time": "4:39:24"} +{"current_steps": 552, "total_steps": 1850, "loss": 0.4116, "lr": 3.983167513160025e-06, "epoch": 2.983783783783784, "percentage": 29.84, "elapsed_time": "1:58:33", "remaining_time": "4:38:48"} +{"current_steps": 553, "total_steps": 1850, "loss": 0.2188, "lr": 3.979747808489036e-06, "epoch": 2.9891891891891893, "percentage": 29.89, "elapsed_time": "1:58:36", "remaining_time": "4:38:09"} +{"current_steps": 554, "total_steps": 1850, "loss": 0.7558, "lr": 3.976323836609289e-06, "epoch": 2.9945945945945946, "percentage": 29.95, "elapsed_time": "1:58:41", "remaining_time": "4:37:40"} +{"current_steps": 555, "total_steps": 1850, "loss": 0.6491, "lr": 3.9728956073946305e-06, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "1:58:43", "remaining_time": "4:37:00"} +{"current_steps": 556, "total_steps": 1850, "loss": 0.1625, "lr": 3.969463130731183e-06, "epoch": 3.0054054054054054, "percentage": 30.05, "elapsed_time": "2:03:17", "remaining_time": "4:46:57"} +{"current_steps": 557, "total_steps": 1850, "loss": 0.311, "lr": 3.966026416517321e-06, "epoch": 3.0108108108108107, "percentage": 30.11, "elapsed_time": "2:03:20", "remaining_time": "4:46:19"} +{"current_steps": 558, "total_steps": 1850, "loss": 0.5299, "lr": 3.962585474663636e-06, "epoch": 3.016216216216216, "percentage": 30.16, "elapsed_time": "2:03:24", "remaining_time": "4:45:43"} +{"current_steps": 559, "total_steps": 1850, "loss": 0.2718, "lr": 3.959140315092911e-06, "epoch": 3.0216216216216214, "percentage": 30.22, "elapsed_time": "2:03:31", "remaining_time": "4:45:15"} +{"current_steps": 560, "total_steps": 1850, "loss": 0.2954, "lr": 3.955690947740092e-06, "epoch": 3.027027027027027, "percentage": 30.27, "elapsed_time": "2:03:32", "remaining_time": "4:44:36"} +{"current_steps": 561, "total_steps": 1850, "loss": 0.2388, "lr": 3.95223738255226e-06, "epoch": 3.0324324324324325, "percentage": 30.32, "elapsed_time": "2:03:39", "remaining_time": "4:44:07"} +{"current_steps": 562, "total_steps": 1850, "loss": 0.2014, "lr": 3.9487796294886015e-06, "epoch": 3.037837837837838, "percentage": 30.38, "elapsed_time": "2:03:42", "remaining_time": "4:43:32"} +{"current_steps": 563, "total_steps": 1850, "loss": 0.2102, "lr": 3.945317698520379e-06, "epoch": 3.0432432432432432, "percentage": 30.43, "elapsed_time": "2:03:45", "remaining_time": "4:42:55"} +{"current_steps": 564, "total_steps": 1850, "loss": 0.499, "lr": 3.941851599630903e-06, "epoch": 3.0486486486486486, "percentage": 30.49, "elapsed_time": "2:03:49", "remaining_time": "4:42:20"} +{"current_steps": 565, "total_steps": 1850, "loss": 0.3392, "lr": 3.938381342815503e-06, "epoch": 3.054054054054054, "percentage": 30.54, "elapsed_time": "2:03:52", "remaining_time": "4:41:44"} +{"current_steps": 566, "total_steps": 1850, "loss": 0.1942, "lr": 3.934906938081499e-06, "epoch": 3.0594594594594593, "percentage": 30.59, "elapsed_time": "2:03:53", "remaining_time": "4:41:04"} +{"current_steps": 567, "total_steps": 1850, "loss": 0.1753, "lr": 3.931428395448174e-06, "epoch": 3.064864864864865, "percentage": 30.65, "elapsed_time": "2:03:56", "remaining_time": "4:40:27"} +{"current_steps": 568, "total_steps": 1850, "loss": 0.2959, "lr": 3.927945724946743e-06, "epoch": 3.0702702702702704, "percentage": 30.7, "elapsed_time": "2:04:02", "remaining_time": "4:39:57"} +{"current_steps": 569, "total_steps": 1850, "loss": 0.4625, "lr": 3.924458936620322e-06, "epoch": 3.075675675675676, "percentage": 30.76, "elapsed_time": "2:04:07", "remaining_time": "4:39:26"} +{"current_steps": 570, "total_steps": 1850, "loss": 0.2571, "lr": 3.920968040523904e-06, "epoch": 3.081081081081081, "percentage": 30.81, "elapsed_time": "2:04:13", "remaining_time": "4:38:57"} +{"current_steps": 571, "total_steps": 1850, "loss": 0.1438, "lr": 3.917473046724329e-06, "epoch": 3.0864864864864865, "percentage": 30.86, "elapsed_time": "2:04:16", "remaining_time": "4:38:20"} +{"current_steps": 572, "total_steps": 1850, "loss": 0.3572, "lr": 3.9139739653002525e-06, "epoch": 3.091891891891892, "percentage": 30.92, "elapsed_time": "2:04:19", "remaining_time": "4:37:46"} +{"current_steps": 573, "total_steps": 1850, "loss": 0.165, "lr": 3.910470806342117e-06, "epoch": 3.097297297297297, "percentage": 30.97, "elapsed_time": "2:04:22", "remaining_time": "4:37:11"} +{"current_steps": 574, "total_steps": 1850, "loss": 0.3209, "lr": 3.9069635799521245e-06, "epoch": 3.1027027027027025, "percentage": 31.03, "elapsed_time": "2:04:24", "remaining_time": "4:36:34"} +{"current_steps": 575, "total_steps": 1850, "loss": 0.1976, "lr": 3.903452296244204e-06, "epoch": 3.108108108108108, "percentage": 31.08, "elapsed_time": "2:04:27", "remaining_time": "4:35:57"} +{"current_steps": 576, "total_steps": 1850, "loss": 0.6074, "lr": 3.899936965343989e-06, "epoch": 3.1135135135135137, "percentage": 31.14, "elapsed_time": "2:04:30", "remaining_time": "4:35:22"} +{"current_steps": 577, "total_steps": 1850, "loss": 0.4051, "lr": 3.89641759738878e-06, "epoch": 3.118918918918919, "percentage": 31.19, "elapsed_time": "2:04:31", "remaining_time": "4:34:44"} +{"current_steps": 578, "total_steps": 1850, "loss": 0.3787, "lr": 3.892894202527523e-06, "epoch": 3.1243243243243244, "percentage": 31.24, "elapsed_time": "2:04:34", "remaining_time": "4:34:09"} +{"current_steps": 579, "total_steps": 1850, "loss": 0.0927, "lr": 3.8893667909207735e-06, "epoch": 3.1297297297297297, "percentage": 31.3, "elapsed_time": "2:04:36", "remaining_time": "4:33:33"} +{"current_steps": 580, "total_steps": 1850, "loss": 0.4706, "lr": 3.88583537274067e-06, "epoch": 3.135135135135135, "percentage": 31.35, "elapsed_time": "2:04:42", "remaining_time": "4:33:03"} +{"current_steps": 581, "total_steps": 1850, "loss": 0.3949, "lr": 3.8822999581709085e-06, "epoch": 3.1405405405405404, "percentage": 31.41, "elapsed_time": "2:04:44", "remaining_time": "4:32:26"} +{"current_steps": 582, "total_steps": 1850, "loss": 0.1971, "lr": 3.878760557406708e-06, "epoch": 3.145945945945946, "percentage": 31.46, "elapsed_time": "2:04:48", "remaining_time": "4:31:54"} +{"current_steps": 583, "total_steps": 1850, "loss": 0.5156, "lr": 3.875217180654779e-06, "epoch": 3.1513513513513516, "percentage": 31.51, "elapsed_time": "2:04:50", "remaining_time": "4:31:18"} +{"current_steps": 584, "total_steps": 1850, "loss": 0.3552, "lr": 3.871669838133303e-06, "epoch": 3.156756756756757, "percentage": 31.57, "elapsed_time": "2:04:52", "remaining_time": "4:30:41"} +{"current_steps": 585, "total_steps": 1850, "loss": 0.4369, "lr": 3.868118540071894e-06, "epoch": 3.1621621621621623, "percentage": 31.62, "elapsed_time": "2:04:53", "remaining_time": "4:30:04"} +{"current_steps": 586, "total_steps": 1850, "loss": 0.3694, "lr": 3.8645632967115755e-06, "epoch": 3.1675675675675676, "percentage": 31.68, "elapsed_time": "2:04:56", "remaining_time": "4:29:28"} +{"current_steps": 587, "total_steps": 1850, "loss": 0.3404, "lr": 3.861004118304746e-06, "epoch": 3.172972972972973, "percentage": 31.73, "elapsed_time": "2:04:59", "remaining_time": "4:28:56"} +{"current_steps": 588, "total_steps": 1850, "loss": 0.3086, "lr": 3.857441015115154e-06, "epoch": 3.1783783783783783, "percentage": 31.78, "elapsed_time": "2:05:03", "remaining_time": "4:28:23"} +{"current_steps": 589, "total_steps": 1850, "loss": 0.253, "lr": 3.8538739974178635e-06, "epoch": 3.1837837837837837, "percentage": 31.84, "elapsed_time": "2:05:07", "remaining_time": "4:27:52"} +{"current_steps": 590, "total_steps": 1850, "loss": 0.2436, "lr": 3.850303075499227e-06, "epoch": 3.189189189189189, "percentage": 31.89, "elapsed_time": "2:05:09", "remaining_time": "4:27:17"} +{"current_steps": 591, "total_steps": 1850, "loss": 0.328, "lr": 3.84672825965686e-06, "epoch": 3.1945945945945944, "percentage": 31.95, "elapsed_time": "2:05:11", "remaining_time": "4:26:42"} +{"current_steps": 592, "total_steps": 1850, "loss": 0.2687, "lr": 3.843149560199601e-06, "epoch": 3.2, "percentage": 32.0, "elapsed_time": "2:05:14", "remaining_time": "4:26:09"} +{"current_steps": 593, "total_steps": 1850, "loss": 0.1417, "lr": 3.839566987447492e-06, "epoch": 3.2054054054054055, "percentage": 32.05, "elapsed_time": "2:05:17", "remaining_time": "4:25:34"} +{"current_steps": 594, "total_steps": 1850, "loss": 0.2106, "lr": 3.835980551731743e-06, "epoch": 3.210810810810811, "percentage": 32.11, "elapsed_time": "2:05:20", "remaining_time": "4:25:00"} +{"current_steps": 595, "total_steps": 1850, "loss": 0.3154, "lr": 3.8323902633947045e-06, "epoch": 3.2162162162162162, "percentage": 32.16, "elapsed_time": "2:05:25", "remaining_time": "4:24:33"} +{"current_steps": 596, "total_steps": 1850, "loss": 0.1218, "lr": 3.828796132789835e-06, "epoch": 3.2216216216216216, "percentage": 32.22, "elapsed_time": "2:05:27", "remaining_time": "4:23:58"} +{"current_steps": 597, "total_steps": 1850, "loss": 0.1336, "lr": 3.825198170281677e-06, "epoch": 3.227027027027027, "percentage": 32.27, "elapsed_time": "2:05:29", "remaining_time": "4:23:22"} +{"current_steps": 598, "total_steps": 1850, "loss": 0.2518, "lr": 3.821596386245819e-06, "epoch": 3.2324324324324323, "percentage": 32.32, "elapsed_time": "2:05:33", "remaining_time": "4:22:51"} +{"current_steps": 599, "total_steps": 1850, "loss": 0.2762, "lr": 3.817990791068874e-06, "epoch": 3.237837837837838, "percentage": 32.38, "elapsed_time": "2:05:39", "remaining_time": "4:22:26"} +{"current_steps": 600, "total_steps": 1850, "loss": 0.2722, "lr": 3.81438139514844e-06, "epoch": 3.2432432432432434, "percentage": 32.43, "elapsed_time": "2:05:44", "remaining_time": "4:21:57"} +{"current_steps": 601, "total_steps": 1850, "loss": 0.3542, "lr": 3.8107682088930797e-06, "epoch": 3.2486486486486488, "percentage": 32.49, "elapsed_time": "2:05:46", "remaining_time": "4:21:23"} +{"current_steps": 602, "total_steps": 1850, "loss": 0.344, "lr": 3.807151242722286e-06, "epoch": 3.254054054054054, "percentage": 32.54, "elapsed_time": "2:05:49", "remaining_time": "4:20:49"} +{"current_steps": 603, "total_steps": 1850, "loss": 0.1625, "lr": 3.8035305070664484e-06, "epoch": 3.2594594594594595, "percentage": 32.59, "elapsed_time": "2:05:52", "remaining_time": "4:20:17"} +{"current_steps": 604, "total_steps": 1850, "loss": 0.2925, "lr": 3.7999060123668318e-06, "epoch": 3.264864864864865, "percentage": 32.65, "elapsed_time": "2:05:57", "remaining_time": "4:19:51"} +{"current_steps": 605, "total_steps": 1850, "loss": 0.1523, "lr": 3.7962777690755364e-06, "epoch": 3.27027027027027, "percentage": 32.7, "elapsed_time": "2:05:59", "remaining_time": "4:19:15"} +{"current_steps": 606, "total_steps": 1850, "loss": 0.1674, "lr": 3.792645787655476e-06, "epoch": 3.2756756756756755, "percentage": 32.76, "elapsed_time": "2:06:03", "remaining_time": "4:18:46"} +{"current_steps": 607, "total_steps": 1850, "loss": 0.2856, "lr": 3.7890100785803425e-06, "epoch": 3.281081081081081, "percentage": 32.81, "elapsed_time": "2:06:06", "remaining_time": "4:18:15"} +{"current_steps": 608, "total_steps": 1850, "loss": 0.1094, "lr": 3.785370652334577e-06, "epoch": 3.2864864864864867, "percentage": 32.86, "elapsed_time": "2:06:09", "remaining_time": "4:17:41"} +{"current_steps": 609, "total_steps": 1850, "loss": 0.2611, "lr": 3.7817275194133403e-06, "epoch": 3.291891891891892, "percentage": 32.92, "elapsed_time": "2:06:12", "remaining_time": "4:17:10"} +{"current_steps": 610, "total_steps": 1850, "loss": 0.1315, "lr": 3.778080690322483e-06, "epoch": 3.2972972972972974, "percentage": 32.97, "elapsed_time": "2:06:14", "remaining_time": "4:16:37"} +{"current_steps": 611, "total_steps": 1850, "loss": 0.1686, "lr": 3.774430175578514e-06, "epoch": 3.3027027027027027, "percentage": 33.03, "elapsed_time": "2:06:15", "remaining_time": "4:16:01"} +{"current_steps": 612, "total_steps": 1850, "loss": 0.4642, "lr": 3.7707759857085706e-06, "epoch": 3.308108108108108, "percentage": 33.08, "elapsed_time": "2:06:17", "remaining_time": "4:15:28"} +{"current_steps": 613, "total_steps": 1850, "loss": 0.1987, "lr": 3.7671181312503886e-06, "epoch": 3.3135135135135134, "percentage": 33.14, "elapsed_time": "2:06:19", "remaining_time": "4:14:54"} +{"current_steps": 614, "total_steps": 1850, "loss": 0.3307, "lr": 3.763456622752271e-06, "epoch": 3.3189189189189188, "percentage": 33.19, "elapsed_time": "2:06:21", "remaining_time": "4:14:22"} +{"current_steps": 615, "total_steps": 1850, "loss": 0.1731, "lr": 3.7597914707730583e-06, "epoch": 3.3243243243243246, "percentage": 33.24, "elapsed_time": "2:06:26", "remaining_time": "4:13:54"} +{"current_steps": 616, "total_steps": 1850, "loss": 0.2003, "lr": 3.7561226858820984e-06, "epoch": 3.32972972972973, "percentage": 33.3, "elapsed_time": "2:06:28", "remaining_time": "4:13:20"} +{"current_steps": 617, "total_steps": 1850, "loss": 0.4014, "lr": 3.7524502786592143e-06, "epoch": 3.3351351351351353, "percentage": 33.35, "elapsed_time": "2:06:30", "remaining_time": "4:12:47"} +{"current_steps": 618, "total_steps": 1850, "loss": 0.205, "lr": 3.7487742596946753e-06, "epoch": 3.3405405405405406, "percentage": 33.41, "elapsed_time": "2:06:33", "remaining_time": "4:12:17"} +{"current_steps": 619, "total_steps": 1850, "loss": 0.2932, "lr": 3.7450946395891674e-06, "epoch": 3.345945945945946, "percentage": 33.46, "elapsed_time": "2:06:36", "remaining_time": "4:11:47"} +{"current_steps": 620, "total_steps": 1850, "loss": 0.2748, "lr": 3.7414114289537593e-06, "epoch": 3.3513513513513513, "percentage": 33.51, "elapsed_time": "2:06:39", "remaining_time": "4:11:16"} +{"current_steps": 621, "total_steps": 1850, "loss": 0.3665, "lr": 3.7377246384098763e-06, "epoch": 3.3567567567567567, "percentage": 33.57, "elapsed_time": "2:06:43", "remaining_time": "4:10:47"} +{"current_steps": 622, "total_steps": 1850, "loss": 0.3453, "lr": 3.7340342785892645e-06, "epoch": 3.362162162162162, "percentage": 33.62, "elapsed_time": "2:06:44", "remaining_time": "4:10:14"} +{"current_steps": 623, "total_steps": 1850, "loss": 0.473, "lr": 3.7303403601339646e-06, "epoch": 3.3675675675675674, "percentage": 33.68, "elapsed_time": "2:06:51", "remaining_time": "4:09:50"} +{"current_steps": 624, "total_steps": 1850, "loss": 0.3017, "lr": 3.726642893696279e-06, "epoch": 3.372972972972973, "percentage": 33.73, "elapsed_time": "2:06:54", "remaining_time": "4:09:20"} +{"current_steps": 625, "total_steps": 1850, "loss": 0.4841, "lr": 3.7229418899387414e-06, "epoch": 3.3783783783783785, "percentage": 33.78, "elapsed_time": "2:06:55", "remaining_time": "4:08:46"} +{"current_steps": 626, "total_steps": 1850, "loss": 0.3879, "lr": 3.719237359534087e-06, "epoch": 3.383783783783784, "percentage": 33.84, "elapsed_time": "2:06:57", "remaining_time": "4:08:14"} +{"current_steps": 627, "total_steps": 1850, "loss": 0.3876, "lr": 3.71552931316522e-06, "epoch": 3.389189189189189, "percentage": 33.89, "elapsed_time": "2:07:05", "remaining_time": "4:07:53"} +{"current_steps": 628, "total_steps": 1850, "loss": 0.4491, "lr": 3.7118177615251834e-06, "epoch": 3.3945945945945946, "percentage": 33.95, "elapsed_time": "2:07:10", "remaining_time": "4:07:28"} +{"current_steps": 629, "total_steps": 1850, "loss": 0.3763, "lr": 3.70810271531713e-06, "epoch": 3.4, "percentage": 34.0, "elapsed_time": "2:07:18", "remaining_time": "4:07:07"} +{"current_steps": 630, "total_steps": 1850, "loss": 0.4171, "lr": 3.7043841852542884e-06, "epoch": 3.4054054054054053, "percentage": 34.05, "elapsed_time": "2:07:20", "remaining_time": "4:06:35"} +{"current_steps": 631, "total_steps": 1850, "loss": 0.2445, "lr": 3.700662182059936e-06, "epoch": 3.410810810810811, "percentage": 34.11, "elapsed_time": "2:07:21", "remaining_time": "4:06:02"} +{"current_steps": 632, "total_steps": 1850, "loss": 0.1347, "lr": 3.696936716467363e-06, "epoch": 3.4162162162162164, "percentage": 34.16, "elapsed_time": "2:07:26", "remaining_time": "4:05:36"} +{"current_steps": 633, "total_steps": 1850, "loss": 0.2822, "lr": 3.693207799219846e-06, "epoch": 3.4216216216216218, "percentage": 34.22, "elapsed_time": "2:07:33", "remaining_time": "4:05:13"} +{"current_steps": 634, "total_steps": 1850, "loss": 0.3425, "lr": 3.689475441070615e-06, "epoch": 3.427027027027027, "percentage": 34.27, "elapsed_time": "2:07:35", "remaining_time": "4:04:43"} +{"current_steps": 635, "total_steps": 1850, "loss": 0.3315, "lr": 3.685739652782822e-06, "epoch": 3.4324324324324325, "percentage": 34.32, "elapsed_time": "2:07:42", "remaining_time": "4:04:21"} +{"current_steps": 636, "total_steps": 1850, "loss": 0.1841, "lr": 3.682000445129512e-06, "epoch": 3.437837837837838, "percentage": 34.38, "elapsed_time": "2:07:44", "remaining_time": "4:03:49"} +{"current_steps": 637, "total_steps": 1850, "loss": 0.3151, "lr": 3.6782578288935896e-06, "epoch": 3.443243243243243, "percentage": 34.43, "elapsed_time": "2:07:47", "remaining_time": "4:03:20"} +{"current_steps": 638, "total_steps": 1850, "loss": 0.1272, "lr": 3.6745118148677882e-06, "epoch": 3.4486486486486485, "percentage": 34.49, "elapsed_time": "2:07:48", "remaining_time": "4:02:47"} +{"current_steps": 639, "total_steps": 1850, "loss": 0.2436, "lr": 3.6707624138546414e-06, "epoch": 3.454054054054054, "percentage": 34.54, "elapsed_time": "2:07:53", "remaining_time": "4:02:22"} +{"current_steps": 640, "total_steps": 1850, "loss": 0.6321, "lr": 3.6670096366664477e-06, "epoch": 3.4594594594594597, "percentage": 34.59, "elapsed_time": "2:07:57", "remaining_time": "4:01:55"} +{"current_steps": 641, "total_steps": 1850, "loss": 0.1262, "lr": 3.663253494125244e-06, "epoch": 3.464864864864865, "percentage": 34.65, "elapsed_time": "2:07:58", "remaining_time": "4:01:22"} +{"current_steps": 642, "total_steps": 1850, "loss": 0.2669, "lr": 3.6594939970627706e-06, "epoch": 3.4702702702702704, "percentage": 34.7, "elapsed_time": "2:08:01", "remaining_time": "4:00:54"} +{"current_steps": 643, "total_steps": 1850, "loss": 0.1228, "lr": 3.655731156320441e-06, "epoch": 3.4756756756756757, "percentage": 34.76, "elapsed_time": "2:08:05", "remaining_time": "4:00:26"} +{"current_steps": 644, "total_steps": 1850, "loss": 0.1759, "lr": 3.651964982749312e-06, "epoch": 3.481081081081081, "percentage": 34.81, "elapsed_time": "2:08:08", "remaining_time": "3:59:58"} +{"current_steps": 645, "total_steps": 1850, "loss": 0.5677, "lr": 3.648195487210051e-06, "epoch": 3.4864864864864864, "percentage": 34.86, "elapsed_time": "2:08:10", "remaining_time": "3:59:28"} +{"current_steps": 646, "total_steps": 1850, "loss": 0.1874, "lr": 3.644422680572906e-06, "epoch": 3.4918918918918918, "percentage": 34.92, "elapsed_time": "2:08:13", "remaining_time": "3:58:58"} +{"current_steps": 647, "total_steps": 1850, "loss": 0.3225, "lr": 3.640646573717671e-06, "epoch": 3.4972972972972975, "percentage": 34.97, "elapsed_time": "2:08:19", "remaining_time": "3:58:36"} +{"current_steps": 648, "total_steps": 1850, "loss": 0.102, "lr": 3.63686717753366e-06, "epoch": 3.5027027027027025, "percentage": 35.03, "elapsed_time": "2:08:21", "remaining_time": "3:58:06"} +{"current_steps": 649, "total_steps": 1850, "loss": 0.1585, "lr": 3.6330845029196697e-06, "epoch": 3.5081081081081082, "percentage": 35.08, "elapsed_time": "2:08:26", "remaining_time": "3:57:40"} +{"current_steps": 650, "total_steps": 1850, "loss": 0.3046, "lr": 3.629298560783952e-06, "epoch": 3.5135135135135136, "percentage": 35.14, "elapsed_time": "2:08:29", "remaining_time": "3:57:12"} +{"current_steps": 651, "total_steps": 1850, "loss": 0.2037, "lr": 3.6255093620441835e-06, "epoch": 3.518918918918919, "percentage": 35.19, "elapsed_time": "2:08:31", "remaining_time": "3:56:42"} +{"current_steps": 652, "total_steps": 1850, "loss": 0.1784, "lr": 3.6217169176274293e-06, "epoch": 3.5243243243243243, "percentage": 35.24, "elapsed_time": "2:08:33", "remaining_time": "3:56:13"} +{"current_steps": 653, "total_steps": 1850, "loss": 0.1974, "lr": 3.6179212384701146e-06, "epoch": 3.5297297297297296, "percentage": 35.3, "elapsed_time": "2:08:35", "remaining_time": "3:55:43"} +{"current_steps": 654, "total_steps": 1850, "loss": 0.2161, "lr": 3.6141223355179946e-06, "epoch": 3.535135135135135, "percentage": 35.35, "elapsed_time": "2:08:38", "remaining_time": "3:55:14"} +{"current_steps": 655, "total_steps": 1850, "loss": 0.1487, "lr": 3.610320219726118e-06, "epoch": 3.5405405405405403, "percentage": 35.41, "elapsed_time": "2:08:42", "remaining_time": "3:54:48"} +{"current_steps": 656, "total_steps": 1850, "loss": 0.2231, "lr": 3.606514902058802e-06, "epoch": 3.545945945945946, "percentage": 35.46, "elapsed_time": "2:08:43", "remaining_time": "3:54:18"} +{"current_steps": 657, "total_steps": 1850, "loss": 0.5068, "lr": 3.602706393489594e-06, "epoch": 3.5513513513513515, "percentage": 35.51, "elapsed_time": "2:08:48", "remaining_time": "3:53:53"} +{"current_steps": 658, "total_steps": 1850, "loss": 0.4621, "lr": 3.598894705001246e-06, "epoch": 3.556756756756757, "percentage": 35.57, "elapsed_time": "2:08:53", "remaining_time": "3:53:30"} +{"current_steps": 659, "total_steps": 1850, "loss": 0.285, "lr": 3.5950798475856783e-06, "epoch": 3.562162162162162, "percentage": 35.62, "elapsed_time": "2:08:57", "remaining_time": "3:53:03"} +{"current_steps": 660, "total_steps": 1850, "loss": 0.4277, "lr": 3.5912618322439487e-06, "epoch": 3.5675675675675675, "percentage": 35.68, "elapsed_time": "2:09:00", "remaining_time": "3:52:37"} +{"current_steps": 661, "total_steps": 1850, "loss": 0.1993, "lr": 3.587440669986224e-06, "epoch": 3.572972972972973, "percentage": 35.73, "elapsed_time": "2:09:03", "remaining_time": "3:52:09"} +{"current_steps": 662, "total_steps": 1850, "loss": 0.272, "lr": 3.5836163718317453e-06, "epoch": 3.5783783783783782, "percentage": 35.78, "elapsed_time": "2:09:09", "remaining_time": "3:51:46"} +{"current_steps": 663, "total_steps": 1850, "loss": 0.6019, "lr": 3.5797889488087946e-06, "epoch": 3.583783783783784, "percentage": 35.84, "elapsed_time": "2:09:12", "remaining_time": "3:51:18"} +{"current_steps": 664, "total_steps": 1850, "loss": 0.3603, "lr": 3.575958411954668e-06, "epoch": 3.589189189189189, "percentage": 35.89, "elapsed_time": "2:09:15", "remaining_time": "3:50:52"} +{"current_steps": 665, "total_steps": 1850, "loss": 0.4656, "lr": 3.5721247723156393e-06, "epoch": 3.5945945945945947, "percentage": 35.95, "elapsed_time": "2:09:16", "remaining_time": "3:50:22"} +{"current_steps": 666, "total_steps": 1850, "loss": 0.2466, "lr": 3.5682880409469316e-06, "epoch": 3.6, "percentage": 36.0, "elapsed_time": "2:09:19", "remaining_time": "3:49:54"} +{"current_steps": 667, "total_steps": 1850, "loss": 0.1848, "lr": 3.564448228912682e-06, "epoch": 3.6054054054054054, "percentage": 36.05, "elapsed_time": "2:09:23", "remaining_time": "3:49:30"} +{"current_steps": 668, "total_steps": 1850, "loss": 0.4968, "lr": 3.5606053472859124e-06, "epoch": 3.610810810810811, "percentage": 36.11, "elapsed_time": "2:09:27", "remaining_time": "3:49:04"} +{"current_steps": 669, "total_steps": 1850, "loss": 0.316, "lr": 3.556759407148496e-06, "epoch": 3.616216216216216, "percentage": 36.16, "elapsed_time": "2:09:29", "remaining_time": "3:48:35"} +{"current_steps": 670, "total_steps": 1850, "loss": 0.2232, "lr": 3.5529104195911258e-06, "epoch": 3.6216216216216215, "percentage": 36.22, "elapsed_time": "2:09:32", "remaining_time": "3:48:08"} +{"current_steps": 671, "total_steps": 1850, "loss": 0.4435, "lr": 3.549058395713285e-06, "epoch": 3.627027027027027, "percentage": 36.27, "elapsed_time": "2:09:35", "remaining_time": "3:47:42"} +{"current_steps": 672, "total_steps": 1850, "loss": 0.1455, "lr": 3.54520334662321e-06, "epoch": 3.6324324324324326, "percentage": 36.32, "elapsed_time": "2:09:37", "remaining_time": "3:47:14"} +{"current_steps": 673, "total_steps": 1850, "loss": 0.3037, "lr": 3.5413452834378626e-06, "epoch": 3.637837837837838, "percentage": 36.38, "elapsed_time": "2:09:42", "remaining_time": "3:46:50"} +{"current_steps": 674, "total_steps": 1850, "loss": 0.4309, "lr": 3.5374842172828953e-06, "epoch": 3.6432432432432433, "percentage": 36.43, "elapsed_time": "2:09:44", "remaining_time": "3:46:22"} +{"current_steps": 675, "total_steps": 1850, "loss": 0.383, "lr": 3.533620159292621e-06, "epoch": 3.6486486486486487, "percentage": 36.49, "elapsed_time": "2:09:47", "remaining_time": "3:45:56"} +{"current_steps": 676, "total_steps": 1850, "loss": 0.1963, "lr": 3.529753120609982e-06, "epoch": 3.654054054054054, "percentage": 36.54, "elapsed_time": "2:09:51", "remaining_time": "3:45:31"} +{"current_steps": 677, "total_steps": 1850, "loss": 0.1922, "lr": 3.5258831123865136e-06, "epoch": 3.6594594594594594, "percentage": 36.59, "elapsed_time": "2:09:53", "remaining_time": "3:45:03"} +{"current_steps": 678, "total_steps": 1850, "loss": 0.5589, "lr": 3.5220101457823147e-06, "epoch": 3.6648648648648647, "percentage": 36.65, "elapsed_time": "2:09:56", "remaining_time": "3:44:37"} +{"current_steps": 679, "total_steps": 1850, "loss": 0.1757, "lr": 3.5181342319660174e-06, "epoch": 3.6702702702702705, "percentage": 36.7, "elapsed_time": "2:09:58", "remaining_time": "3:44:09"} +{"current_steps": 680, "total_steps": 1850, "loss": 0.1208, "lr": 3.5142553821147498e-06, "epoch": 3.6756756756756754, "percentage": 36.76, "elapsed_time": "2:09:59", "remaining_time": "3:43:39"} +{"current_steps": 681, "total_steps": 1850, "loss": 0.2416, "lr": 3.5103736074141106e-06, "epoch": 3.6810810810810812, "percentage": 36.81, "elapsed_time": "2:10:03", "remaining_time": "3:43:14"} +{"current_steps": 682, "total_steps": 1850, "loss": 0.3841, "lr": 3.5064889190581293e-06, "epoch": 3.6864864864864866, "percentage": 36.86, "elapsed_time": "2:10:04", "remaining_time": "3:42:46"} +{"current_steps": 683, "total_steps": 1850, "loss": 0.3723, "lr": 3.5026013282492406e-06, "epoch": 3.691891891891892, "percentage": 36.92, "elapsed_time": "2:10:07", "remaining_time": "3:42:21"} +{"current_steps": 684, "total_steps": 1850, "loss": 0.4403, "lr": 3.498710846198247e-06, "epoch": 3.6972972972972973, "percentage": 36.97, "elapsed_time": "2:10:12", "remaining_time": "3:41:57"} +{"current_steps": 685, "total_steps": 1850, "loss": 0.2813, "lr": 3.494817484124289e-06, "epoch": 3.7027027027027026, "percentage": 37.03, "elapsed_time": "2:10:16", "remaining_time": "3:41:34"} +{"current_steps": 686, "total_steps": 1850, "loss": 0.4287, "lr": 3.490921253254813e-06, "epoch": 3.708108108108108, "percentage": 37.08, "elapsed_time": "2:10:18", "remaining_time": "3:41:06"} +{"current_steps": 687, "total_steps": 1850, "loss": 0.234, "lr": 3.487022164825539e-06, "epoch": 3.7135135135135133, "percentage": 37.14, "elapsed_time": "2:10:23", "remaining_time": "3:40:44"} +{"current_steps": 688, "total_steps": 1850, "loss": 0.2135, "lr": 3.4831202300804246e-06, "epoch": 3.718918918918919, "percentage": 37.19, "elapsed_time": "2:10:29", "remaining_time": "3:40:23"} +{"current_steps": 689, "total_steps": 1850, "loss": 0.2725, "lr": 3.479215460271638e-06, "epoch": 3.7243243243243245, "percentage": 37.24, "elapsed_time": "2:10:32", "remaining_time": "3:39:58"} +{"current_steps": 690, "total_steps": 1850, "loss": 0.228, "lr": 3.475307866659522e-06, "epoch": 3.72972972972973, "percentage": 37.3, "elapsed_time": "2:10:36", "remaining_time": "3:39:33"} +{"current_steps": 691, "total_steps": 1850, "loss": 0.0985, "lr": 3.4713974605125634e-06, "epoch": 3.735135135135135, "percentage": 37.35, "elapsed_time": "2:10:39", "remaining_time": "3:39:08"} +{"current_steps": 692, "total_steps": 1850, "loss": 0.2137, "lr": 3.4674842531073587e-06, "epoch": 3.7405405405405405, "percentage": 37.41, "elapsed_time": "2:10:43", "remaining_time": "3:38:45"} +{"current_steps": 693, "total_steps": 1850, "loss": 0.1707, "lr": 3.4635682557285833e-06, "epoch": 3.745945945945946, "percentage": 37.46, "elapsed_time": "2:10:45", "remaining_time": "3:38:17"} +{"current_steps": 694, "total_steps": 1850, "loss": 0.3021, "lr": 3.459649479668956e-06, "epoch": 3.7513513513513512, "percentage": 37.51, "elapsed_time": "2:10:50", "remaining_time": "3:37:56"} +{"current_steps": 695, "total_steps": 1850, "loss": 0.3457, "lr": 3.4557279362292117e-06, "epoch": 3.756756756756757, "percentage": 37.57, "elapsed_time": "2:10:51", "remaining_time": "3:37:28"} +{"current_steps": 696, "total_steps": 1850, "loss": 0.1193, "lr": 3.451803636718064e-06, "epoch": 3.762162162162162, "percentage": 37.62, "elapsed_time": "2:10:53", "remaining_time": "3:37:00"} +{"current_steps": 697, "total_steps": 1850, "loss": 0.2261, "lr": 3.447876592452174e-06, "epoch": 3.7675675675675677, "percentage": 37.68, "elapsed_time": "2:10:54", "remaining_time": "3:36:33"} +{"current_steps": 698, "total_steps": 1850, "loss": 0.5042, "lr": 3.4439468147561196e-06, "epoch": 3.772972972972973, "percentage": 37.73, "elapsed_time": "2:10:56", "remaining_time": "3:36:07"} +{"current_steps": 699, "total_steps": 1850, "loss": 0.3481, "lr": 3.440014314962358e-06, "epoch": 3.7783783783783784, "percentage": 37.78, "elapsed_time": "2:11:00", "remaining_time": "3:35:42"} +{"current_steps": 700, "total_steps": 1850, "loss": 0.2317, "lr": 3.4360791044112e-06, "epoch": 3.7837837837837838, "percentage": 37.84, "elapsed_time": "2:11:02", "remaining_time": "3:35:16"} +{"current_steps": 701, "total_steps": 1850, "loss": 0.395, "lr": 3.432141194450772e-06, "epoch": 3.789189189189189, "percentage": 37.89, "elapsed_time": "2:11:04", "remaining_time": "3:34:49"} +{"current_steps": 702, "total_steps": 1850, "loss": 0.1767, "lr": 3.4282005964369836e-06, "epoch": 3.7945945945945945, "percentage": 37.95, "elapsed_time": "2:11:06", "remaining_time": "3:34:24"} +{"current_steps": 703, "total_steps": 1850, "loss": 0.2146, "lr": 3.424257321733497e-06, "epoch": 3.8, "percentage": 38.0, "elapsed_time": "2:11:08", "remaining_time": "3:33:58"} +{"current_steps": 704, "total_steps": 1850, "loss": 0.1534, "lr": 3.4203113817116955e-06, "epoch": 3.8054054054054056, "percentage": 38.05, "elapsed_time": "2:11:11", "remaining_time": "3:33:34"} +{"current_steps": 705, "total_steps": 1850, "loss": 0.2513, "lr": 3.4163627877506434e-06, "epoch": 3.810810810810811, "percentage": 38.11, "elapsed_time": "2:11:14", "remaining_time": "3:33:08"} +{"current_steps": 706, "total_steps": 1850, "loss": 0.4154, "lr": 3.4124115512370636e-06, "epoch": 3.8162162162162163, "percentage": 38.16, "elapsed_time": "2:11:16", "remaining_time": "3:32:43"} +{"current_steps": 707, "total_steps": 1850, "loss": 0.1822, "lr": 3.408457683565295e-06, "epoch": 3.8216216216216217, "percentage": 38.22, "elapsed_time": "2:11:20", "remaining_time": "3:32:20"} +{"current_steps": 708, "total_steps": 1850, "loss": 0.3589, "lr": 3.4045011961372675e-06, "epoch": 3.827027027027027, "percentage": 38.27, "elapsed_time": "2:11:24", "remaining_time": "3:31:57"} +{"current_steps": 709, "total_steps": 1850, "loss": 0.4615, "lr": 3.4005421003624637e-06, "epoch": 3.8324324324324324, "percentage": 38.32, "elapsed_time": "2:11:27", "remaining_time": "3:31:34"} +{"current_steps": 710, "total_steps": 1850, "loss": 0.1001, "lr": 3.3965804076578896e-06, "epoch": 3.8378378378378377, "percentage": 38.38, "elapsed_time": "2:11:31", "remaining_time": "3:31:11"} +{"current_steps": 711, "total_steps": 1850, "loss": 0.2788, "lr": 3.392616129448039e-06, "epoch": 3.8432432432432435, "percentage": 38.43, "elapsed_time": "2:11:38", "remaining_time": "3:30:53"} +{"current_steps": 712, "total_steps": 1850, "loss": 0.2663, "lr": 3.3886492771648593e-06, "epoch": 3.8486486486486484, "percentage": 38.49, "elapsed_time": "2:11:42", "remaining_time": "3:30:29"} +{"current_steps": 713, "total_steps": 1850, "loss": 0.3497, "lr": 3.384679862247726e-06, "epoch": 3.854054054054054, "percentage": 38.54, "elapsed_time": "2:11:45", "remaining_time": "3:30:06"} +{"current_steps": 714, "total_steps": 1850, "loss": 0.3613, "lr": 3.3807078961434013e-06, "epoch": 3.8594594594594596, "percentage": 38.59, "elapsed_time": "2:11:49", "remaining_time": "3:29:44"} +{"current_steps": 715, "total_steps": 1850, "loss": 0.0783, "lr": 3.376733390306004e-06, "epoch": 3.864864864864865, "percentage": 38.65, "elapsed_time": "2:11:50", "remaining_time": "3:29:17"} +{"current_steps": 716, "total_steps": 1850, "loss": 0.1617, "lr": 3.372756356196979e-06, "epoch": 3.8702702702702703, "percentage": 38.7, "elapsed_time": "2:11:54", "remaining_time": "3:28:54"} +{"current_steps": 717, "total_steps": 1850, "loss": 0.6444, "lr": 3.3687768052850595e-06, "epoch": 3.8756756756756756, "percentage": 38.76, "elapsed_time": "2:11:56", "remaining_time": "3:28:29"} +{"current_steps": 718, "total_steps": 1850, "loss": 0.4858, "lr": 3.364794749046239e-06, "epoch": 3.881081081081081, "percentage": 38.81, "elapsed_time": "2:11:58", "remaining_time": "3:28:03"} +{"current_steps": 719, "total_steps": 1850, "loss": 0.3103, "lr": 3.3608101989637333e-06, "epoch": 3.8864864864864863, "percentage": 38.86, "elapsed_time": "2:12:03", "remaining_time": "3:27:43"} +{"current_steps": 720, "total_steps": 1850, "loss": 0.2501, "lr": 3.356823166527952e-06, "epoch": 3.891891891891892, "percentage": 38.92, "elapsed_time": "2:12:09", "remaining_time": "3:27:24"} +{"current_steps": 721, "total_steps": 1850, "loss": 0.18, "lr": 3.352833663236463e-06, "epoch": 3.8972972972972975, "percentage": 38.97, "elapsed_time": "2:12:15", "remaining_time": "3:27:05"} +{"current_steps": 722, "total_steps": 1850, "loss": 0.12, "lr": 3.348841700593956e-06, "epoch": 3.902702702702703, "percentage": 39.03, "elapsed_time": "2:12:16", "remaining_time": "3:26:39"} +{"current_steps": 723, "total_steps": 1850, "loss": 0.2618, "lr": 3.3448472901122187e-06, "epoch": 3.908108108108108, "percentage": 39.08, "elapsed_time": "2:12:19", "remaining_time": "3:26:15"} +{"current_steps": 724, "total_steps": 1850, "loss": 0.3689, "lr": 3.340850443310092e-06, "epoch": 3.9135135135135135, "percentage": 39.14, "elapsed_time": "2:12:21", "remaining_time": "3:25:50"} +{"current_steps": 725, "total_steps": 1850, "loss": 0.2195, "lr": 3.336851171713447e-06, "epoch": 3.918918918918919, "percentage": 39.19, "elapsed_time": "2:12:23", "remaining_time": "3:25:26"} +{"current_steps": 726, "total_steps": 1850, "loss": 0.2602, "lr": 3.3328494868551444e-06, "epoch": 3.924324324324324, "percentage": 39.24, "elapsed_time": "2:12:27", "remaining_time": "3:25:04"} +{"current_steps": 727, "total_steps": 1850, "loss": 0.1561, "lr": 3.3288454002750046e-06, "epoch": 3.92972972972973, "percentage": 39.3, "elapsed_time": "2:12:29", "remaining_time": "3:24:40"} +{"current_steps": 728, "total_steps": 1850, "loss": 0.4469, "lr": 3.3248389235197764e-06, "epoch": 3.935135135135135, "percentage": 39.35, "elapsed_time": "2:12:31", "remaining_time": "3:24:15"} +{"current_steps": 729, "total_steps": 1850, "loss": 0.2246, "lr": 3.3208300681430967e-06, "epoch": 3.9405405405405407, "percentage": 39.41, "elapsed_time": "2:12:34", "remaining_time": "3:23:51"} +{"current_steps": 730, "total_steps": 1850, "loss": 0.2743, "lr": 3.3168188457054656e-06, "epoch": 3.945945945945946, "percentage": 39.46, "elapsed_time": "2:12:35", "remaining_time": "3:23:26"} +{"current_steps": 731, "total_steps": 1850, "loss": 0.551, "lr": 3.312805267774209e-06, "epoch": 3.9513513513513514, "percentage": 39.51, "elapsed_time": "2:12:39", "remaining_time": "3:23:03"} +{"current_steps": 732, "total_steps": 1850, "loss": 0.3522, "lr": 3.3087893459234423e-06, "epoch": 3.9567567567567568, "percentage": 39.57, "elapsed_time": "2:12:41", "remaining_time": "3:22:39"} +{"current_steps": 733, "total_steps": 1850, "loss": 0.3084, "lr": 3.304771091734043e-06, "epoch": 3.962162162162162, "percentage": 39.62, "elapsed_time": "2:12:42", "remaining_time": "3:22:14"} +{"current_steps": 734, "total_steps": 1850, "loss": 0.3406, "lr": 3.300750516793614e-06, "epoch": 3.9675675675675675, "percentage": 39.68, "elapsed_time": "2:12:49", "remaining_time": "3:21:56"} +{"current_steps": 735, "total_steps": 1850, "loss": 0.3463, "lr": 3.2967276326964504e-06, "epoch": 3.972972972972973, "percentage": 39.73, "elapsed_time": "2:12:52", "remaining_time": "3:21:34"} +{"current_steps": 736, "total_steps": 1850, "loss": 0.3758, "lr": 3.2927024510435057e-06, "epoch": 3.9783783783783786, "percentage": 39.78, "elapsed_time": "2:12:56", "remaining_time": "3:21:12"} +{"current_steps": 737, "total_steps": 1850, "loss": 0.3328, "lr": 3.2886749834423587e-06, "epoch": 3.983783783783784, "percentage": 39.84, "elapsed_time": "2:12:59", "remaining_time": "3:20:50"} +{"current_steps": 738, "total_steps": 1850, "loss": 0.6213, "lr": 3.284645241507183e-06, "epoch": 3.9891891891891893, "percentage": 39.89, "elapsed_time": "2:13:01", "remaining_time": "3:20:26"} +{"current_steps": 739, "total_steps": 1850, "loss": 0.2463, "lr": 3.280613236858707e-06, "epoch": 3.9945945945945946, "percentage": 39.95, "elapsed_time": "2:13:05", "remaining_time": "3:20:05"} +{"current_steps": 740, "total_steps": 1850, "loss": 0.3501, "lr": 3.2765789811241865e-06, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "2:13:08", "remaining_time": "3:19:42"} +{"current_steps": 741, "total_steps": 1850, "loss": 0.1753, "lr": 3.272542485937369e-06, "epoch": 4.005405405405406, "percentage": 40.05, "elapsed_time": "2:19:14", "remaining_time": "3:28:22"} +{"current_steps": 742, "total_steps": 1850, "loss": 0.0722, "lr": 3.2685037629384587e-06, "epoch": 4.010810810810811, "percentage": 40.11, "elapsed_time": "2:19:14", "remaining_time": "3:27:55"} +{"current_steps": 743, "total_steps": 1850, "loss": 0.2475, "lr": 3.264462823774085e-06, "epoch": 4.0162162162162165, "percentage": 40.16, "elapsed_time": "2:19:19", "remaining_time": "3:27:34"} +{"current_steps": 744, "total_steps": 1850, "loss": 0.1163, "lr": 3.260419680097268e-06, "epoch": 4.021621621621621, "percentage": 40.22, "elapsed_time": "2:19:20", "remaining_time": "3:27:08"} +{"current_steps": 745, "total_steps": 1850, "loss": 0.1325, "lr": 3.2563743435673855e-06, "epoch": 4.027027027027027, "percentage": 40.27, "elapsed_time": "2:19:23", "remaining_time": "3:26:44"} +{"current_steps": 746, "total_steps": 1850, "loss": 0.0466, "lr": 3.252326825850139e-06, "epoch": 4.032432432432432, "percentage": 40.32, "elapsed_time": "2:19:24", "remaining_time": "3:26:19"} +{"current_steps": 747, "total_steps": 1850, "loss": 0.1861, "lr": 3.2482771386175173e-06, "epoch": 4.037837837837838, "percentage": 40.38, "elapsed_time": "2:19:28", "remaining_time": "3:25:57"} +{"current_steps": 748, "total_steps": 1850, "loss": 0.1637, "lr": 3.24422529354777e-06, "epoch": 4.043243243243243, "percentage": 40.43, "elapsed_time": "2:19:31", "remaining_time": "3:25:33"} +{"current_steps": 749, "total_steps": 1850, "loss": 0.1379, "lr": 3.2401713023253646e-06, "epoch": 4.048648648648649, "percentage": 40.49, "elapsed_time": "2:19:36", "remaining_time": "3:25:12"} +{"current_steps": 750, "total_steps": 1850, "loss": 0.1099, "lr": 3.2361151766409628e-06, "epoch": 4.054054054054054, "percentage": 40.54, "elapsed_time": "2:19:37", "remaining_time": "3:24:47"} +{"current_steps": 751, "total_steps": 1850, "loss": 0.1422, "lr": 3.232056928191376e-06, "epoch": 4.059459459459459, "percentage": 40.59, "elapsed_time": "2:19:41", "remaining_time": "3:24:24"} +{"current_steps": 752, "total_steps": 1850, "loss": 0.2716, "lr": 3.2279965686795424e-06, "epoch": 4.064864864864865, "percentage": 40.65, "elapsed_time": "2:19:43", "remaining_time": "3:24:00"} +{"current_steps": 753, "total_steps": 1850, "loss": 0.3849, "lr": 3.2239341098144833e-06, "epoch": 4.07027027027027, "percentage": 40.7, "elapsed_time": "2:19:49", "remaining_time": "3:23:42"} +{"current_steps": 754, "total_steps": 1850, "loss": 0.0768, "lr": 3.219869563311277e-06, "epoch": 4.075675675675676, "percentage": 40.76, "elapsed_time": "2:19:52", "remaining_time": "3:23:19"} +{"current_steps": 755, "total_steps": 1850, "loss": 0.112, "lr": 3.2158029408910213e-06, "epoch": 4.081081081081081, "percentage": 40.81, "elapsed_time": "2:19:54", "remaining_time": "3:22:55"} +{"current_steps": 756, "total_steps": 1850, "loss": 0.1054, "lr": 3.2117342542807995e-06, "epoch": 4.0864864864864865, "percentage": 40.86, "elapsed_time": "2:20:02", "remaining_time": "3:22:38"} +{"current_steps": 757, "total_steps": 1850, "loss": 0.1754, "lr": 3.207663515213648e-06, "epoch": 4.091891891891892, "percentage": 40.92, "elapsed_time": "2:20:06", "remaining_time": "3:22:17"} +{"current_steps": 758, "total_steps": 1850, "loss": 0.191, "lr": 3.2035907354285234e-06, "epoch": 4.097297297297297, "percentage": 40.97, "elapsed_time": "2:20:08", "remaining_time": "3:21:54"} +{"current_steps": 759, "total_steps": 1850, "loss": 0.1083, "lr": 3.1995159266702648e-06, "epoch": 4.102702702702703, "percentage": 41.03, "elapsed_time": "2:20:09", "remaining_time": "3:21:28"} +{"current_steps": 760, "total_steps": 1850, "loss": 0.0609, "lr": 3.1954391006895635e-06, "epoch": 4.108108108108108, "percentage": 41.08, "elapsed_time": "2:20:11", "remaining_time": "3:21:04"} +{"current_steps": 761, "total_steps": 1850, "loss": 0.049, "lr": 3.191360269242928e-06, "epoch": 4.113513513513514, "percentage": 41.14, "elapsed_time": "2:20:13", "remaining_time": "3:20:39"} +{"current_steps": 762, "total_steps": 1850, "loss": 0.1642, "lr": 3.18727944409265e-06, "epoch": 4.118918918918919, "percentage": 41.19, "elapsed_time": "2:20:16", "remaining_time": "3:20:17"} +{"current_steps": 763, "total_steps": 1850, "loss": 0.1513, "lr": 3.1831966370067714e-06, "epoch": 4.124324324324324, "percentage": 41.24, "elapsed_time": "2:20:21", "remaining_time": "3:19:57"} +{"current_steps": 764, "total_steps": 1850, "loss": 0.3276, "lr": 3.1791118597590467e-06, "epoch": 4.12972972972973, "percentage": 41.3, "elapsed_time": "2:20:27", "remaining_time": "3:19:39"} +{"current_steps": 765, "total_steps": 1850, "loss": 0.4011, "lr": 3.1750251241289148e-06, "epoch": 4.135135135135135, "percentage": 41.35, "elapsed_time": "2:20:33", "remaining_time": "3:19:21"} +{"current_steps": 766, "total_steps": 1850, "loss": 0.2274, "lr": 3.1709364419014615e-06, "epoch": 4.140540540540541, "percentage": 41.41, "elapsed_time": "2:20:35", "remaining_time": "3:18:57"} +{"current_steps": 767, "total_steps": 1850, "loss": 0.118, "lr": 3.166845824867384e-06, "epoch": 4.145945945945946, "percentage": 41.46, "elapsed_time": "2:20:38", "remaining_time": "3:18:34"} +{"current_steps": 768, "total_steps": 1850, "loss": 0.1109, "lr": 3.162753284822962e-06, "epoch": 4.151351351351352, "percentage": 41.51, "elapsed_time": "2:20:40", "remaining_time": "3:18:12"} +{"current_steps": 769, "total_steps": 1850, "loss": 0.1754, "lr": 3.1586588335700176e-06, "epoch": 4.1567567567567565, "percentage": 41.57, "elapsed_time": "2:20:42", "remaining_time": "3:17:47"} +{"current_steps": 770, "total_steps": 1850, "loss": 0.1155, "lr": 3.1545624829158873e-06, "epoch": 4.162162162162162, "percentage": 41.62, "elapsed_time": "2:20:46", "remaining_time": "3:17:26"} +{"current_steps": 771, "total_steps": 1850, "loss": 0.0635, "lr": 3.1504642446733828e-06, "epoch": 4.167567567567567, "percentage": 41.68, "elapsed_time": "2:20:49", "remaining_time": "3:17:05"} +{"current_steps": 772, "total_steps": 1850, "loss": 0.1068, "lr": 3.146364130660761e-06, "epoch": 4.172972972972973, "percentage": 41.73, "elapsed_time": "2:20:54", "remaining_time": "3:16:45"} +{"current_steps": 773, "total_steps": 1850, "loss": 0.0637, "lr": 3.142262152701685e-06, "epoch": 4.178378378378379, "percentage": 41.78, "elapsed_time": "2:20:56", "remaining_time": "3:16:22"} +{"current_steps": 774, "total_steps": 1850, "loss": 0.2703, "lr": 3.138158322625197e-06, "epoch": 4.183783783783784, "percentage": 41.84, "elapsed_time": "2:20:59", "remaining_time": "3:15:59"} +{"current_steps": 775, "total_steps": 1850, "loss": 0.2769, "lr": 3.1340526522656765e-06, "epoch": 4.1891891891891895, "percentage": 41.89, "elapsed_time": "2:21:01", "remaining_time": "3:15:37"} +{"current_steps": 776, "total_steps": 1850, "loss": 0.1192, "lr": 3.1299451534628134e-06, "epoch": 4.194594594594594, "percentage": 41.95, "elapsed_time": "2:21:04", "remaining_time": "3:15:15"} +{"current_steps": 777, "total_steps": 1850, "loss": 0.1244, "lr": 3.1258358380615674e-06, "epoch": 4.2, "percentage": 42.0, "elapsed_time": "2:21:10", "remaining_time": "3:14:56"} +{"current_steps": 778, "total_steps": 1850, "loss": 0.2819, "lr": 3.121724717912138e-06, "epoch": 4.205405405405405, "percentage": 42.05, "elapsed_time": "2:21:13", "remaining_time": "3:14:34"} +{"current_steps": 779, "total_steps": 1850, "loss": 0.1018, "lr": 3.1176118048699283e-06, "epoch": 4.210810810810811, "percentage": 42.11, "elapsed_time": "2:21:16", "remaining_time": "3:14:13"} +{"current_steps": 780, "total_steps": 1850, "loss": 0.1842, "lr": 3.113497110795514e-06, "epoch": 4.216216216216216, "percentage": 42.16, "elapsed_time": "2:21:22", "remaining_time": "3:13:56"} +{"current_steps": 781, "total_steps": 1850, "loss": 0.2299, "lr": 3.1093806475546046e-06, "epoch": 4.221621621621622, "percentage": 42.22, "elapsed_time": "2:21:29", "remaining_time": "3:13:40"} +{"current_steps": 782, "total_steps": 1850, "loss": 0.1397, "lr": 3.1052624270180116e-06, "epoch": 4.227027027027027, "percentage": 42.27, "elapsed_time": "2:21:36", "remaining_time": "3:13:23"} +{"current_steps": 783, "total_steps": 1850, "loss": 0.2236, "lr": 3.1011424610616153e-06, "epoch": 4.232432432432432, "percentage": 42.32, "elapsed_time": "2:21:38", "remaining_time": "3:13:00"} +{"current_steps": 784, "total_steps": 1850, "loss": 0.1417, "lr": 3.097020761566328e-06, "epoch": 4.237837837837838, "percentage": 42.38, "elapsed_time": "2:21:41", "remaining_time": "3:12:39"} +{"current_steps": 785, "total_steps": 1850, "loss": 0.1317, "lr": 3.092897340418062e-06, "epoch": 4.243243243243243, "percentage": 42.43, "elapsed_time": "2:21:42", "remaining_time": "3:12:15"} +{"current_steps": 786, "total_steps": 1850, "loss": 0.1869, "lr": 3.088772209507694e-06, "epoch": 4.248648648648649, "percentage": 42.49, "elapsed_time": "2:21:43", "remaining_time": "3:11:51"} +{"current_steps": 787, "total_steps": 1850, "loss": 0.0967, "lr": 3.0846453807310317e-06, "epoch": 4.254054054054054, "percentage": 42.54, "elapsed_time": "2:21:45", "remaining_time": "3:11:28"} +{"current_steps": 788, "total_steps": 1850, "loss": 0.0731, "lr": 3.080516865988778e-06, "epoch": 4.2594594594594595, "percentage": 42.59, "elapsed_time": "2:21:48", "remaining_time": "3:11:07"} +{"current_steps": 789, "total_steps": 1850, "loss": 0.1912, "lr": 3.076386677186498e-06, "epoch": 4.264864864864865, "percentage": 42.65, "elapsed_time": "2:21:50", "remaining_time": "3:10:44"} +{"current_steps": 790, "total_steps": 1850, "loss": 0.2133, "lr": 3.0722548262345854e-06, "epoch": 4.27027027027027, "percentage": 42.7, "elapsed_time": "2:21:54", "remaining_time": "3:10:25"} +{"current_steps": 791, "total_steps": 1850, "loss": 0.4454, "lr": 3.0681213250482255e-06, "epoch": 4.275675675675676, "percentage": 42.76, "elapsed_time": "2:21:56", "remaining_time": "3:10:01"} +{"current_steps": 792, "total_steps": 1850, "loss": 0.3645, "lr": 3.0639861855473637e-06, "epoch": 4.281081081081081, "percentage": 42.81, "elapsed_time": "2:21:58", "remaining_time": "3:09:39"} +{"current_steps": 793, "total_steps": 1850, "loss": 0.1331, "lr": 3.05984941965667e-06, "epoch": 4.286486486486487, "percentage": 42.86, "elapsed_time": "2:21:59", "remaining_time": "3:09:15"} +{"current_steps": 794, "total_steps": 1850, "loss": 0.0863, "lr": 3.055711039305503e-06, "epoch": 4.291891891891892, "percentage": 42.92, "elapsed_time": "2:22:03", "remaining_time": "3:08:55"} +{"current_steps": 795, "total_steps": 1850, "loss": 0.1988, "lr": 3.051571056427879e-06, "epoch": 4.297297297297297, "percentage": 42.97, "elapsed_time": "2:22:06", "remaining_time": "3:08:35"} +{"current_steps": 796, "total_steps": 1850, "loss": 0.2307, "lr": 3.047429482962433e-06, "epoch": 4.302702702702703, "percentage": 43.03, "elapsed_time": "2:22:07", "remaining_time": "3:08:11"} +{"current_steps": 797, "total_steps": 1850, "loss": 0.1614, "lr": 3.0432863308523903e-06, "epoch": 4.308108108108108, "percentage": 43.08, "elapsed_time": "2:22:09", "remaining_time": "3:07:49"} +{"current_steps": 798, "total_steps": 1850, "loss": 0.0683, "lr": 3.039141612045525e-06, "epoch": 4.313513513513514, "percentage": 43.14, "elapsed_time": "2:22:12", "remaining_time": "3:07:27"} +{"current_steps": 799, "total_steps": 1850, "loss": 0.1784, "lr": 3.034995338494131e-06, "epoch": 4.318918918918919, "percentage": 43.19, "elapsed_time": "2:22:15", "remaining_time": "3:07:06"} +{"current_steps": 800, "total_steps": 1850, "loss": 0.0451, "lr": 3.0308475221549868e-06, "epoch": 4.324324324324325, "percentage": 43.24, "elapsed_time": "2:22:16", "remaining_time": "3:06:44"} +{"current_steps": 801, "total_steps": 1850, "loss": 0.0618, "lr": 3.026698174989316e-06, "epoch": 4.3297297297297295, "percentage": 43.3, "elapsed_time": "2:22:20", "remaining_time": "3:06:24"} +{"current_steps": 802, "total_steps": 1850, "loss": 0.1529, "lr": 3.0225473089627617e-06, "epoch": 4.335135135135135, "percentage": 43.35, "elapsed_time": "2:22:25", "remaining_time": "3:06:06"} +{"current_steps": 803, "total_steps": 1850, "loss": 0.4177, "lr": 3.0183949360453442e-06, "epoch": 4.34054054054054, "percentage": 43.41, "elapsed_time": "2:22:30", "remaining_time": "3:05:48"} +{"current_steps": 804, "total_steps": 1850, "loss": 0.1394, "lr": 3.014241068211428e-06, "epoch": 4.345945945945946, "percentage": 43.46, "elapsed_time": "2:22:32", "remaining_time": "3:05:27"} +{"current_steps": 805, "total_steps": 1850, "loss": 0.04, "lr": 3.0100857174396926e-06, "epoch": 4.351351351351352, "percentage": 43.51, "elapsed_time": "2:22:34", "remaining_time": "3:05:04"} +{"current_steps": 806, "total_steps": 1850, "loss": 0.2705, "lr": 3.0059288957130893e-06, "epoch": 4.356756756756757, "percentage": 43.57, "elapsed_time": "2:22:37", "remaining_time": "3:04:44"} +{"current_steps": 807, "total_steps": 1850, "loss": 0.2208, "lr": 3.001770615018815e-06, "epoch": 4.3621621621621625, "percentage": 43.62, "elapsed_time": "2:22:42", "remaining_time": "3:04:26"} +{"current_steps": 808, "total_steps": 1850, "loss": 0.2068, "lr": 2.9976108873482725e-06, "epoch": 4.367567567567567, "percentage": 43.68, "elapsed_time": "2:22:45", "remaining_time": "3:04:06"} +{"current_steps": 809, "total_steps": 1850, "loss": 0.1253, "lr": 2.9934497246970357e-06, "epoch": 4.372972972972973, "percentage": 43.73, "elapsed_time": "2:22:47", "remaining_time": "3:03:44"} +{"current_steps": 810, "total_steps": 1850, "loss": 0.1721, "lr": 2.989287139064819e-06, "epoch": 4.378378378378378, "percentage": 43.78, "elapsed_time": "2:22:49", "remaining_time": "3:03:23"} +{"current_steps": 811, "total_steps": 1850, "loss": 0.134, "lr": 2.9851231424554385e-06, "epoch": 4.383783783783784, "percentage": 43.84, "elapsed_time": "2:22:51", "remaining_time": "3:03:01"} +{"current_steps": 812, "total_steps": 1850, "loss": 0.0818, "lr": 2.9809577468767813e-06, "epoch": 4.389189189189189, "percentage": 43.89, "elapsed_time": "2:22:53", "remaining_time": "3:02:39"} +{"current_steps": 813, "total_steps": 1850, "loss": 0.1797, "lr": 2.9767909643407676e-06, "epoch": 4.394594594594595, "percentage": 43.95, "elapsed_time": "2:22:56", "remaining_time": "3:02:19"} +{"current_steps": 814, "total_steps": 1850, "loss": 0.145, "lr": 2.9726228068633155e-06, "epoch": 4.4, "percentage": 44.0, "elapsed_time": "2:23:00", "remaining_time": "3:02:00"} +{"current_steps": 815, "total_steps": 1850, "loss": 0.079, "lr": 2.9684532864643123e-06, "epoch": 4.405405405405405, "percentage": 44.05, "elapsed_time": "2:23:03", "remaining_time": "3:01:40"} +{"current_steps": 816, "total_steps": 1850, "loss": 0.1763, "lr": 2.9642824151675702e-06, "epoch": 4.410810810810811, "percentage": 44.11, "elapsed_time": "2:23:05", "remaining_time": "3:01:19"} +{"current_steps": 817, "total_steps": 1850, "loss": 0.2654, "lr": 2.9601102050008016e-06, "epoch": 4.416216216216216, "percentage": 44.16, "elapsed_time": "2:23:09", "remaining_time": "3:01:00"} +{"current_steps": 818, "total_steps": 1850, "loss": 0.0779, "lr": 2.955936667995578e-06, "epoch": 4.421621621621622, "percentage": 44.22, "elapsed_time": "2:23:11", "remaining_time": "3:00:39"} +{"current_steps": 819, "total_steps": 1850, "loss": 0.0587, "lr": 2.9517618161872974e-06, "epoch": 4.427027027027027, "percentage": 44.27, "elapsed_time": "2:23:15", "remaining_time": "3:00:20"} +{"current_steps": 820, "total_steps": 1850, "loss": 0.0835, "lr": 2.9475856616151487e-06, "epoch": 4.4324324324324325, "percentage": 44.32, "elapsed_time": "2:23:19", "remaining_time": "3:00:01"} +{"current_steps": 821, "total_steps": 1850, "loss": 0.1748, "lr": 2.9434082163220773e-06, "epoch": 4.437837837837838, "percentage": 44.38, "elapsed_time": "2:23:22", "remaining_time": "2:59:41"} +{"current_steps": 822, "total_steps": 1850, "loss": 0.119, "lr": 2.9392294923547543e-06, "epoch": 4.443243243243243, "percentage": 44.43, "elapsed_time": "2:23:24", "remaining_time": "2:59:21"} +{"current_steps": 823, "total_steps": 1850, "loss": 0.1535, "lr": 2.9350495017635334e-06, "epoch": 4.448648648648649, "percentage": 44.49, "elapsed_time": "2:23:26", "remaining_time": "2:58:59"} +{"current_steps": 824, "total_steps": 1850, "loss": 0.2561, "lr": 2.9308682566024228e-06, "epoch": 4.454054054054054, "percentage": 44.54, "elapsed_time": "2:23:31", "remaining_time": "2:58:42"} +{"current_steps": 825, "total_steps": 1850, "loss": 0.2024, "lr": 2.92668576892905e-06, "epoch": 4.45945945945946, "percentage": 44.59, "elapsed_time": "2:23:35", "remaining_time": "2:58:24"} +{"current_steps": 826, "total_steps": 1850, "loss": 0.0436, "lr": 2.9225020508046233e-06, "epoch": 4.464864864864865, "percentage": 44.65, "elapsed_time": "2:23:37", "remaining_time": "2:58:02"} +{"current_steps": 827, "total_steps": 1850, "loss": 0.1636, "lr": 2.9183171142939002e-06, "epoch": 4.47027027027027, "percentage": 44.7, "elapsed_time": "2:23:39", "remaining_time": "2:57:42"} +{"current_steps": 828, "total_steps": 1850, "loss": 0.0962, "lr": 2.9141309714651528e-06, "epoch": 4.475675675675676, "percentage": 44.76, "elapsed_time": "2:23:42", "remaining_time": "2:57:22"} +{"current_steps": 829, "total_steps": 1850, "loss": 0.2129, "lr": 2.9099436343901306e-06, "epoch": 4.481081081081081, "percentage": 44.81, "elapsed_time": "2:23:47", "remaining_time": "2:57:05"} +{"current_steps": 830, "total_steps": 1850, "loss": 0.2872, "lr": 2.9057551151440266e-06, "epoch": 4.486486486486487, "percentage": 44.86, "elapsed_time": "2:23:49", "remaining_time": "2:56:44"} +{"current_steps": 831, "total_steps": 1850, "loss": 0.3254, "lr": 2.9015654258054433e-06, "epoch": 4.491891891891892, "percentage": 44.92, "elapsed_time": "2:23:55", "remaining_time": "2:56:29"} +{"current_steps": 832, "total_steps": 1850, "loss": 0.1417, "lr": 2.8973745784563596e-06, "epoch": 4.4972972972972975, "percentage": 44.97, "elapsed_time": "2:23:58", "remaining_time": "2:56:09"} +{"current_steps": 833, "total_steps": 1850, "loss": 0.2513, "lr": 2.8931825851820904e-06, "epoch": 4.5027027027027025, "percentage": 45.03, "elapsed_time": "2:24:01", "remaining_time": "2:55:50"} +{"current_steps": 834, "total_steps": 1850, "loss": 0.1785, "lr": 2.8889894580712574e-06, "epoch": 4.508108108108108, "percentage": 45.08, "elapsed_time": "2:24:04", "remaining_time": "2:55:31"} +{"current_steps": 835, "total_steps": 1850, "loss": 0.2853, "lr": 2.884795209215751e-06, "epoch": 4.513513513513513, "percentage": 45.14, "elapsed_time": "2:24:08", "remaining_time": "2:55:12"} +{"current_steps": 836, "total_steps": 1850, "loss": 0.2947, "lr": 2.880599850710696e-06, "epoch": 4.518918918918919, "percentage": 45.19, "elapsed_time": "2:24:11", "remaining_time": "2:54:53"} +{"current_steps": 837, "total_steps": 1850, "loss": 0.177, "lr": 2.8764033946544197e-06, "epoch": 4.524324324324324, "percentage": 45.24, "elapsed_time": "2:24:17", "remaining_time": "2:54:37"} +{"current_steps": 838, "total_steps": 1850, "loss": 0.2786, "lr": 2.8722058531484105e-06, "epoch": 4.52972972972973, "percentage": 45.3, "elapsed_time": "2:24:18", "remaining_time": "2:54:16"} +{"current_steps": 839, "total_steps": 1850, "loss": 0.1881, "lr": 2.86800723829729e-06, "epoch": 4.535135135135135, "percentage": 45.35, "elapsed_time": "2:24:20", "remaining_time": "2:53:56"} +{"current_steps": 840, "total_steps": 1850, "loss": 0.3541, "lr": 2.8638075622087747e-06, "epoch": 4.54054054054054, "percentage": 45.41, "elapsed_time": "2:24:23", "remaining_time": "2:53:36"} +{"current_steps": 841, "total_steps": 1850, "loss": 0.3094, "lr": 2.8596068369936386e-06, "epoch": 4.545945945945946, "percentage": 45.46, "elapsed_time": "2:24:26", "remaining_time": "2:53:18"} +{"current_steps": 842, "total_steps": 1850, "loss": 0.1162, "lr": 2.8554050747656862e-06, "epoch": 4.551351351351351, "percentage": 45.51, "elapsed_time": "2:24:29", "remaining_time": "2:52:59"} +{"current_steps": 843, "total_steps": 1850, "loss": 0.1079, "lr": 2.851202287641709e-06, "epoch": 4.556756756756757, "percentage": 45.57, "elapsed_time": "2:24:32", "remaining_time": "2:52:39"} +{"current_steps": 844, "total_steps": 1850, "loss": 0.4462, "lr": 2.8469984877414525e-06, "epoch": 4.562162162162162, "percentage": 45.62, "elapsed_time": "2:24:35", "remaining_time": "2:52:20"} +{"current_steps": 845, "total_steps": 1850, "loss": 0.0851, "lr": 2.842793687187588e-06, "epoch": 4.5675675675675675, "percentage": 45.68, "elapsed_time": "2:24:37", "remaining_time": "2:52:00"} +{"current_steps": 846, "total_steps": 1850, "loss": 0.1268, "lr": 2.8385878981056663e-06, "epoch": 4.572972972972973, "percentage": 45.73, "elapsed_time": "2:24:39", "remaining_time": "2:51:40"} +{"current_steps": 847, "total_steps": 1850, "loss": 0.3187, "lr": 2.8343811326240944e-06, "epoch": 4.578378378378378, "percentage": 45.78, "elapsed_time": "2:24:44", "remaining_time": "2:51:23"} +{"current_steps": 848, "total_steps": 1850, "loss": 0.1315, "lr": 2.830173402874091e-06, "epoch": 4.583783783783784, "percentage": 45.84, "elapsed_time": "2:24:51", "remaining_time": "2:51:10"} +{"current_steps": 849, "total_steps": 1850, "loss": 0.301, "lr": 2.8259647209896573e-06, "epoch": 4.589189189189189, "percentage": 45.89, "elapsed_time": "2:24:54", "remaining_time": "2:50:51"} +{"current_steps": 850, "total_steps": 1850, "loss": 0.1478, "lr": 2.821755099107541e-06, "epoch": 4.594594594594595, "percentage": 45.95, "elapsed_time": "2:24:56", "remaining_time": "2:50:31"} +{"current_steps": 851, "total_steps": 1850, "loss": 0.2029, "lr": 2.817544549367197e-06, "epoch": 4.6, "percentage": 46.0, "elapsed_time": "2:25:00", "remaining_time": "2:50:13"} +{"current_steps": 852, "total_steps": 1850, "loss": 0.0549, "lr": 2.813333083910761e-06, "epoch": 4.605405405405405, "percentage": 46.05, "elapsed_time": "2:25:03", "remaining_time": "2:49:55"} +{"current_steps": 853, "total_steps": 1850, "loss": 0.1508, "lr": 2.8091207148830046e-06, "epoch": 4.610810810810811, "percentage": 46.11, "elapsed_time": "2:25:08", "remaining_time": "2:49:38"} +{"current_steps": 854, "total_steps": 1850, "loss": 0.1094, "lr": 2.8049074544313094e-06, "epoch": 4.616216216216216, "percentage": 46.16, "elapsed_time": "2:25:11", "remaining_time": "2:49:19"} +{"current_steps": 855, "total_steps": 1850, "loss": 0.0799, "lr": 2.8006933147056236e-06, "epoch": 4.621621621621622, "percentage": 46.22, "elapsed_time": "2:25:14", "remaining_time": "2:49:01"} +{"current_steps": 856, "total_steps": 1850, "loss": 0.123, "lr": 2.7964783078584336e-06, "epoch": 4.627027027027027, "percentage": 46.27, "elapsed_time": "2:25:17", "remaining_time": "2:48:42"} +{"current_steps": 857, "total_steps": 1850, "loss": 0.0692, "lr": 2.792262446044725e-06, "epoch": 4.632432432432433, "percentage": 46.32, "elapsed_time": "2:25:21", "remaining_time": "2:48:25"} +{"current_steps": 858, "total_steps": 1850, "loss": 0.1596, "lr": 2.788045741421949e-06, "epoch": 4.6378378378378375, "percentage": 46.38, "elapsed_time": "2:25:26", "remaining_time": "2:48:09"} +{"current_steps": 859, "total_steps": 1850, "loss": 0.047, "lr": 2.78382820614999e-06, "epoch": 4.643243243243243, "percentage": 46.43, "elapsed_time": "2:25:28", "remaining_time": "2:47:49"} +{"current_steps": 860, "total_steps": 1850, "loss": 0.1561, "lr": 2.779609852391123e-06, "epoch": 4.648648648648649, "percentage": 46.49, "elapsed_time": "2:25:31", "remaining_time": "2:47:31"} +{"current_steps": 861, "total_steps": 1850, "loss": 0.2157, "lr": 2.775390692309987e-06, "epoch": 4.654054054054054, "percentage": 46.54, "elapsed_time": "2:25:35", "remaining_time": "2:47:13"} +{"current_steps": 862, "total_steps": 1850, "loss": 0.0782, "lr": 2.7711707380735443e-06, "epoch": 4.65945945945946, "percentage": 46.59, "elapsed_time": "2:25:37", "remaining_time": "2:46:54"} +{"current_steps": 863, "total_steps": 1850, "loss": 0.2994, "lr": 2.766950001851049e-06, "epoch": 4.664864864864865, "percentage": 46.65, "elapsed_time": "2:25:43", "remaining_time": "2:46:39"} +{"current_steps": 864, "total_steps": 1850, "loss": 0.109, "lr": 2.7627284958140084e-06, "epoch": 4.6702702702702705, "percentage": 46.7, "elapsed_time": "2:25:48", "remaining_time": "2:46:24"} +{"current_steps": 865, "total_steps": 1850, "loss": 0.2557, "lr": 2.7585062321361517e-06, "epoch": 4.675675675675675, "percentage": 46.76, "elapsed_time": "2:25:52", "remaining_time": "2:46:06"} +{"current_steps": 866, "total_steps": 1850, "loss": 0.0413, "lr": 2.75428322299339e-06, "epoch": 4.681081081081081, "percentage": 46.81, "elapsed_time": "2:25:53", "remaining_time": "2:45:45"} +{"current_steps": 867, "total_steps": 1850, "loss": 0.0402, "lr": 2.7500594805637882e-06, "epoch": 4.686486486486486, "percentage": 46.86, "elapsed_time": "2:25:54", "remaining_time": "2:45:25"} +{"current_steps": 868, "total_steps": 1850, "loss": 0.1481, "lr": 2.745835017027522e-06, "epoch": 4.691891891891892, "percentage": 46.92, "elapsed_time": "2:26:00", "remaining_time": "2:45:11"} +{"current_steps": 869, "total_steps": 1850, "loss": 0.2242, "lr": 2.74160984456685e-06, "epoch": 4.697297297297297, "percentage": 46.97, "elapsed_time": "2:26:03", "remaining_time": "2:44:53"} +{"current_steps": 870, "total_steps": 1850, "loss": 0.4693, "lr": 2.737383975366071e-06, "epoch": 4.702702702702703, "percentage": 47.03, "elapsed_time": "2:26:05", "remaining_time": "2:44:33"} +{"current_steps": 871, "total_steps": 1850, "loss": 0.1353, "lr": 2.7331574216114963e-06, "epoch": 4.708108108108108, "percentage": 47.08, "elapsed_time": "2:26:11", "remaining_time": "2:44:19"} +{"current_steps": 872, "total_steps": 1850, "loss": 0.157, "lr": 2.728930195491411e-06, "epoch": 4.713513513513513, "percentage": 47.14, "elapsed_time": "2:26:15", "remaining_time": "2:44:01"} +{"current_steps": 873, "total_steps": 1850, "loss": 0.1863, "lr": 2.724702309196038e-06, "epoch": 4.718918918918919, "percentage": 47.19, "elapsed_time": "2:26:17", "remaining_time": "2:43:42"} +{"current_steps": 874, "total_steps": 1850, "loss": 0.2874, "lr": 2.720473774917505e-06, "epoch": 4.724324324324324, "percentage": 47.24, "elapsed_time": "2:26:21", "remaining_time": "2:43:26"} +{"current_steps": 875, "total_steps": 1850, "loss": 0.1021, "lr": 2.716244604849807e-06, "epoch": 4.72972972972973, "percentage": 47.3, "elapsed_time": "2:26:23", "remaining_time": "2:43:07"} +{"current_steps": 876, "total_steps": 1850, "loss": 0.1046, "lr": 2.7120148111887732e-06, "epoch": 4.735135135135135, "percentage": 47.35, "elapsed_time": "2:26:27", "remaining_time": "2:42:51"} +{"current_steps": 877, "total_steps": 1850, "loss": 0.0971, "lr": 2.707784406132032e-06, "epoch": 4.7405405405405405, "percentage": 47.41, "elapsed_time": "2:26:29", "remaining_time": "2:42:31"} +{"current_steps": 878, "total_steps": 1850, "loss": 0.0507, "lr": 2.703553401878972e-06, "epoch": 4.745945945945946, "percentage": 47.46, "elapsed_time": "2:26:31", "remaining_time": "2:42:12"} +{"current_steps": 879, "total_steps": 1850, "loss": 0.0616, "lr": 2.6993218106307146e-06, "epoch": 4.751351351351351, "percentage": 47.51, "elapsed_time": "2:26:33", "remaining_time": "2:41:54"} +{"current_steps": 880, "total_steps": 1850, "loss": 0.0908, "lr": 2.6950896445900685e-06, "epoch": 4.756756756756757, "percentage": 47.57, "elapsed_time": "2:26:35", "remaining_time": "2:41:34"} +{"current_steps": 881, "total_steps": 1850, "loss": 0.2426, "lr": 2.690856915961504e-06, "epoch": 4.762162162162162, "percentage": 47.62, "elapsed_time": "2:26:37", "remaining_time": "2:41:16"} +{"current_steps": 882, "total_steps": 1850, "loss": 0.1881, "lr": 2.686623636951112e-06, "epoch": 4.767567567567568, "percentage": 47.68, "elapsed_time": "2:26:40", "remaining_time": "2:40:58"} +{"current_steps": 883, "total_steps": 1850, "loss": 0.1385, "lr": 2.6823898197665703e-06, "epoch": 4.772972972972973, "percentage": 47.73, "elapsed_time": "2:26:42", "remaining_time": "2:40:39"} +{"current_steps": 884, "total_steps": 1850, "loss": 0.2913, "lr": 2.6781554766171104e-06, "epoch": 4.778378378378378, "percentage": 47.78, "elapsed_time": "2:26:44", "remaining_time": "2:40:21"} +{"current_steps": 885, "total_steps": 1850, "loss": 0.0874, "lr": 2.673920619713478e-06, "epoch": 4.783783783783784, "percentage": 47.84, "elapsed_time": "2:26:51", "remaining_time": "2:40:07"} +{"current_steps": 886, "total_steps": 1850, "loss": 0.2703, "lr": 2.6696852612679024e-06, "epoch": 4.789189189189189, "percentage": 47.89, "elapsed_time": "2:26:54", "remaining_time": "2:39:50"} +{"current_steps": 887, "total_steps": 1850, "loss": 0.121, "lr": 2.6654494134940586e-06, "epoch": 4.794594594594595, "percentage": 47.95, "elapsed_time": "2:26:59", "remaining_time": "2:39:35"} +{"current_steps": 888, "total_steps": 1850, "loss": 0.1853, "lr": 2.6612130886070313e-06, "epoch": 4.8, "percentage": 48.0, "elapsed_time": "2:27:02", "remaining_time": "2:39:17"} +{"current_steps": 889, "total_steps": 1850, "loss": 0.0533, "lr": 2.6569762988232838e-06, "epoch": 4.805405405405406, "percentage": 48.05, "elapsed_time": "2:27:05", "remaining_time": "2:39:00"} +{"current_steps": 890, "total_steps": 1850, "loss": 0.3178, "lr": 2.652739056360618e-06, "epoch": 4.8108108108108105, "percentage": 48.11, "elapsed_time": "2:27:08", "remaining_time": "2:38:42"} +{"current_steps": 891, "total_steps": 1850, "loss": 0.1735, "lr": 2.648501373438142e-06, "epoch": 4.816216216216216, "percentage": 48.16, "elapsed_time": "2:27:10", "remaining_time": "2:38:23"} +{"current_steps": 892, "total_steps": 1850, "loss": 0.062, "lr": 2.644263262276234e-06, "epoch": 4.821621621621622, "percentage": 48.22, "elapsed_time": "2:27:13", "remaining_time": "2:38:07"} +{"current_steps": 893, "total_steps": 1850, "loss": 0.1336, "lr": 2.640024735096507e-06, "epoch": 4.827027027027027, "percentage": 48.27, "elapsed_time": "2:27:14", "remaining_time": "2:37:47"} +{"current_steps": 894, "total_steps": 1850, "loss": 0.1404, "lr": 2.6357858041217733e-06, "epoch": 4.832432432432433, "percentage": 48.32, "elapsed_time": "2:27:19", "remaining_time": "2:37:32"} +{"current_steps": 895, "total_steps": 1850, "loss": 0.0373, "lr": 2.6315464815760104e-06, "epoch": 4.837837837837838, "percentage": 48.38, "elapsed_time": "2:27:20", "remaining_time": "2:37:13"} +{"current_steps": 896, "total_steps": 1850, "loss": 0.3068, "lr": 2.6273067796843242e-06, "epoch": 4.8432432432432435, "percentage": 48.43, "elapsed_time": "2:27:23", "remaining_time": "2:36:55"} +{"current_steps": 897, "total_steps": 1850, "loss": 0.2221, "lr": 2.6230667106729157e-06, "epoch": 4.848648648648648, "percentage": 48.49, "elapsed_time": "2:27:26", "remaining_time": "2:36:38"} +{"current_steps": 898, "total_steps": 1850, "loss": 0.1431, "lr": 2.618826286769043e-06, "epoch": 4.854054054054054, "percentage": 48.54, "elapsed_time": "2:27:33", "remaining_time": "2:36:26"} +{"current_steps": 899, "total_steps": 1850, "loss": 0.196, "lr": 2.614585520200989e-06, "epoch": 4.859459459459459, "percentage": 48.59, "elapsed_time": "2:27:36", "remaining_time": "2:36:08"} +{"current_steps": 900, "total_steps": 1850, "loss": 0.2509, "lr": 2.6103444231980233e-06, "epoch": 4.864864864864865, "percentage": 48.65, "elapsed_time": "2:27:37", "remaining_time": "2:35:49"} +{"current_steps": 901, "total_steps": 1850, "loss": 0.0747, "lr": 2.606103007990371e-06, "epoch": 4.87027027027027, "percentage": 48.7, "elapsed_time": "2:27:40", "remaining_time": "2:35:33"} +{"current_steps": 902, "total_steps": 1850, "loss": 0.0494, "lr": 2.601861286809172e-06, "epoch": 4.875675675675676, "percentage": 48.76, "elapsed_time": "2:27:41", "remaining_time": "2:35:13"} +{"current_steps": 903, "total_steps": 1850, "loss": 0.0901, "lr": 2.5976192718864497e-06, "epoch": 4.881081081081081, "percentage": 48.81, "elapsed_time": "2:27:44", "remaining_time": "2:34:56"} +{"current_steps": 904, "total_steps": 1850, "loss": 0.0465, "lr": 2.593376975455075e-06, "epoch": 4.886486486486486, "percentage": 48.86, "elapsed_time": "2:27:45", "remaining_time": "2:34:37"} +{"current_steps": 905, "total_steps": 1850, "loss": 0.0616, "lr": 2.5891344097487294e-06, "epoch": 4.891891891891892, "percentage": 48.92, "elapsed_time": "2:27:46", "remaining_time": "2:34:18"} +{"current_steps": 906, "total_steps": 1850, "loss": 0.087, "lr": 2.584891587001872e-06, "epoch": 4.897297297297297, "percentage": 48.97, "elapsed_time": "2:27:50", "remaining_time": "2:34:02"} +{"current_steps": 907, "total_steps": 1850, "loss": 0.053, "lr": 2.580648519449704e-06, "epoch": 4.902702702702703, "percentage": 49.03, "elapsed_time": "2:27:52", "remaining_time": "2:33:44"} +{"current_steps": 908, "total_steps": 1850, "loss": 0.2707, "lr": 2.5764052193281287e-06, "epoch": 4.908108108108108, "percentage": 49.08, "elapsed_time": "2:27:55", "remaining_time": "2:33:27"} +{"current_steps": 909, "total_steps": 1850, "loss": 0.3679, "lr": 2.5721616988737254e-06, "epoch": 4.9135135135135135, "percentage": 49.14, "elapsed_time": "2:27:58", "remaining_time": "2:33:10"} +{"current_steps": 910, "total_steps": 1850, "loss": 0.1929, "lr": 2.567917970323704e-06, "epoch": 4.918918918918919, "percentage": 49.19, "elapsed_time": "2:28:00", "remaining_time": "2:32:53"} +{"current_steps": 911, "total_steps": 1850, "loss": 0.2461, "lr": 2.5636740459158776e-06, "epoch": 4.924324324324324, "percentage": 49.24, "elapsed_time": "2:28:04", "remaining_time": "2:32:37"} +{"current_steps": 912, "total_steps": 1850, "loss": 0.2484, "lr": 2.559429937888624e-06, "epoch": 4.92972972972973, "percentage": 49.3, "elapsed_time": "2:28:06", "remaining_time": "2:32:19"} +{"current_steps": 913, "total_steps": 1850, "loss": 0.1886, "lr": 2.5551856584808483e-06, "epoch": 4.935135135135135, "percentage": 49.35, "elapsed_time": "2:28:08", "remaining_time": "2:32:02"} +{"current_steps": 914, "total_steps": 1850, "loss": 0.1789, "lr": 2.5509412199319515e-06, "epoch": 4.940540540540541, "percentage": 49.41, "elapsed_time": "2:28:10", "remaining_time": "2:31:44"} +{"current_steps": 915, "total_steps": 1850, "loss": 0.1072, "lr": 2.5466966344817927e-06, "epoch": 4.945945945945946, "percentage": 49.46, "elapsed_time": "2:28:14", "remaining_time": "2:31:28"} +{"current_steps": 916, "total_steps": 1850, "loss": 0.2624, "lr": 2.542451914370656e-06, "epoch": 4.951351351351351, "percentage": 49.51, "elapsed_time": "2:28:17", "remaining_time": "2:31:12"} +{"current_steps": 917, "total_steps": 1850, "loss": 0.0639, "lr": 2.538207071839213e-06, "epoch": 4.956756756756757, "percentage": 49.57, "elapsed_time": "2:28:18", "remaining_time": "2:30:53"} +{"current_steps": 918, "total_steps": 1850, "loss": 0.1281, "lr": 2.533962119128487e-06, "epoch": 4.962162162162162, "percentage": 49.62, "elapsed_time": "2:28:22", "remaining_time": "2:30:37"} +{"current_steps": 919, "total_steps": 1850, "loss": 0.1771, "lr": 2.529717068479821e-06, "epoch": 4.967567567567568, "percentage": 49.68, "elapsed_time": "2:28:26", "remaining_time": "2:30:22"} +{"current_steps": 920, "total_steps": 1850, "loss": 0.2582, "lr": 2.5254719321348392e-06, "epoch": 4.972972972972973, "percentage": 49.73, "elapsed_time": "2:28:33", "remaining_time": "2:30:10"} +{"current_steps": 921, "total_steps": 1850, "loss": 0.3016, "lr": 2.5212267223354143e-06, "epoch": 4.978378378378379, "percentage": 49.78, "elapsed_time": "2:28:37", "remaining_time": "2:29:55"} +{"current_steps": 922, "total_steps": 1850, "loss": 0.2775, "lr": 2.5169814513236296e-06, "epoch": 4.9837837837837835, "percentage": 49.84, "elapsed_time": "2:28:42", "remaining_time": "2:29:40"} +{"current_steps": 923, "total_steps": 1850, "loss": 0.1246, "lr": 2.5127361313417447e-06, "epoch": 4.989189189189189, "percentage": 49.89, "elapsed_time": "2:28:44", "remaining_time": "2:29:22"} +{"current_steps": 924, "total_steps": 1850, "loss": 0.1732, "lr": 2.508490774632162e-06, "epoch": 4.994594594594595, "percentage": 49.95, "elapsed_time": "2:28:45", "remaining_time": "2:29:04"} +{"current_steps": 925, "total_steps": 1850, "loss": 0.1107, "lr": 2.5042453934373874e-06, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "2:28:47", "remaining_time": "2:28:47"} +{"current_steps": 926, "total_steps": 1850, "loss": 0.1074, "lr": 2.5e-06, "epoch": 5.005405405405406, "percentage": 50.05, "elapsed_time": "2:34:50", "remaining_time": "2:34:30"} +{"current_steps": 927, "total_steps": 1850, "loss": 0.0752, "lr": 2.4957546065626134e-06, "epoch": 5.010810810810811, "percentage": 50.11, "elapsed_time": "2:34:53", "remaining_time": "2:34:13"} +{"current_steps": 928, "total_steps": 1850, "loss": 0.0313, "lr": 2.491509225367839e-06, "epoch": 5.0162162162162165, "percentage": 50.16, "elapsed_time": "2:34:57", "remaining_time": "2:33:57"} +{"current_steps": 929, "total_steps": 1850, "loss": 0.0851, "lr": 2.487263868658256e-06, "epoch": 5.021621621621621, "percentage": 50.22, "elapsed_time": "2:35:04", "remaining_time": "2:33:44"} +{"current_steps": 930, "total_steps": 1850, "loss": 0.0443, "lr": 2.483018548676371e-06, "epoch": 5.027027027027027, "percentage": 50.27, "elapsed_time": "2:35:07", "remaining_time": "2:33:27"} +{"current_steps": 931, "total_steps": 1850, "loss": 0.056, "lr": 2.478773277664587e-06, "epoch": 5.032432432432432, "percentage": 50.32, "elapsed_time": "2:35:09", "remaining_time": "2:33:09"} +{"current_steps": 932, "total_steps": 1850, "loss": 0.1668, "lr": 2.4745280678651616e-06, "epoch": 5.037837837837838, "percentage": 50.38, "elapsed_time": "2:35:13", "remaining_time": "2:32:53"} +{"current_steps": 933, "total_steps": 1850, "loss": 0.0502, "lr": 2.47028293152018e-06, "epoch": 5.043243243243243, "percentage": 50.43, "elapsed_time": "2:35:16", "remaining_time": "2:32:36"} +{"current_steps": 934, "total_steps": 1850, "loss": 0.023, "lr": 2.4660378808715147e-06, "epoch": 5.048648648648649, "percentage": 50.49, "elapsed_time": "2:35:17", "remaining_time": "2:32:18"} +{"current_steps": 935, "total_steps": 1850, "loss": 0.1418, "lr": 2.4617929281607885e-06, "epoch": 5.054054054054054, "percentage": 50.54, "elapsed_time": "2:35:21", "remaining_time": "2:32:01"} +{"current_steps": 936, "total_steps": 1850, "loss": 0.1167, "lr": 2.457548085629345e-06, "epoch": 5.059459459459459, "percentage": 50.59, "elapsed_time": "2:35:23", "remaining_time": "2:31:44"} +{"current_steps": 937, "total_steps": 1850, "loss": 0.0781, "lr": 2.4533033655182072e-06, "epoch": 5.064864864864865, "percentage": 50.65, "elapsed_time": "2:35:24", "remaining_time": "2:31:25"} +{"current_steps": 938, "total_steps": 1850, "loss": 0.0799, "lr": 2.449058780068049e-06, "epoch": 5.07027027027027, "percentage": 50.7, "elapsed_time": "2:35:26", "remaining_time": "2:31:07"} +{"current_steps": 939, "total_steps": 1850, "loss": 0.0548, "lr": 2.444814341519152e-06, "epoch": 5.075675675675676, "percentage": 50.76, "elapsed_time": "2:35:30", "remaining_time": "2:30:52"} +{"current_steps": 940, "total_steps": 1850, "loss": 0.1218, "lr": 2.440570062111376e-06, "epoch": 5.081081081081081, "percentage": 50.81, "elapsed_time": "2:35:34", "remaining_time": "2:30:36"} +{"current_steps": 941, "total_steps": 1850, "loss": 0.0182, "lr": 2.436325954084122e-06, "epoch": 5.0864864864864865, "percentage": 50.86, "elapsed_time": "2:35:34", "remaining_time": "2:30:17"} +{"current_steps": 942, "total_steps": 1850, "loss": 0.0337, "lr": 2.4320820296762964e-06, "epoch": 5.091891891891892, "percentage": 50.92, "elapsed_time": "2:35:38", "remaining_time": "2:30:01"} +{"current_steps": 943, "total_steps": 1850, "loss": 0.0226, "lr": 2.4278383011262755e-06, "epoch": 5.097297297297297, "percentage": 50.97, "elapsed_time": "2:35:41", "remaining_time": "2:29:44"} +{"current_steps": 944, "total_steps": 1850, "loss": 0.0207, "lr": 2.4235947806718717e-06, "epoch": 5.102702702702703, "percentage": 51.03, "elapsed_time": "2:35:43", "remaining_time": "2:29:27"} +{"current_steps": 945, "total_steps": 1850, "loss": 0.1561, "lr": 2.4193514805502972e-06, "epoch": 5.108108108108108, "percentage": 51.08, "elapsed_time": "2:35:47", "remaining_time": "2:29:11"} +{"current_steps": 946, "total_steps": 1850, "loss": 0.1727, "lr": 2.4151084129981284e-06, "epoch": 5.113513513513514, "percentage": 51.14, "elapsed_time": "2:35:50", "remaining_time": "2:28:55"} +{"current_steps": 947, "total_steps": 1850, "loss": 0.1246, "lr": 2.4108655902512715e-06, "epoch": 5.118918918918919, "percentage": 51.19, "elapsed_time": "2:35:54", "remaining_time": "2:28:39"} +{"current_steps": 948, "total_steps": 1850, "loss": 0.1429, "lr": 2.406623024544926e-06, "epoch": 5.124324324324324, "percentage": 51.24, "elapsed_time": "2:35:56", "remaining_time": "2:28:22"} +{"current_steps": 949, "total_steps": 1850, "loss": 0.1569, "lr": 2.402380728113551e-06, "epoch": 5.12972972972973, "percentage": 51.3, "elapsed_time": "2:35:59", "remaining_time": "2:28:06"} +{"current_steps": 950, "total_steps": 1850, "loss": 0.1105, "lr": 2.3981387131908286e-06, "epoch": 5.135135135135135, "percentage": 51.35, "elapsed_time": "2:36:02", "remaining_time": "2:27:50"} +{"current_steps": 951, "total_steps": 1850, "loss": 0.3786, "lr": 2.39389699200963e-06, "epoch": 5.140540540540541, "percentage": 51.41, "elapsed_time": "2:36:06", "remaining_time": "2:27:34"} +{"current_steps": 952, "total_steps": 1850, "loss": 0.0826, "lr": 2.389655576801977e-06, "epoch": 5.145945945945946, "percentage": 51.46, "elapsed_time": "2:36:07", "remaining_time": "2:27:16"} +{"current_steps": 953, "total_steps": 1850, "loss": 0.0684, "lr": 2.3854144797990123e-06, "epoch": 5.151351351351352, "percentage": 51.51, "elapsed_time": "2:36:09", "remaining_time": "2:26:59"} +{"current_steps": 954, "total_steps": 1850, "loss": 0.0452, "lr": 2.3811737132309584e-06, "epoch": 5.1567567567567565, "percentage": 51.57, "elapsed_time": "2:36:11", "remaining_time": "2:26:41"} +{"current_steps": 955, "total_steps": 1850, "loss": 0.0465, "lr": 2.3769332893270856e-06, "epoch": 5.162162162162162, "percentage": 51.62, "elapsed_time": "2:36:14", "remaining_time": "2:26:25"} +{"current_steps": 956, "total_steps": 1850, "loss": 0.0551, "lr": 2.372693220315677e-06, "epoch": 5.167567567567567, "percentage": 51.68, "elapsed_time": "2:36:19", "remaining_time": "2:26:11"} +{"current_steps": 957, "total_steps": 1850, "loss": 0.0896, "lr": 2.36845351842399e-06, "epoch": 5.172972972972973, "percentage": 51.73, "elapsed_time": "2:36:26", "remaining_time": "2:25:59"} +{"current_steps": 958, "total_steps": 1850, "loss": 0.0565, "lr": 2.3642141958782267e-06, "epoch": 5.178378378378379, "percentage": 51.78, "elapsed_time": "2:36:32", "remaining_time": "2:25:45"} +{"current_steps": 959, "total_steps": 1850, "loss": 0.1563, "lr": 2.3599752649034935e-06, "epoch": 5.183783783783784, "percentage": 51.84, "elapsed_time": "2:36:34", "remaining_time": "2:25:28"} +{"current_steps": 960, "total_steps": 1850, "loss": 0.0236, "lr": 2.3557367377237663e-06, "epoch": 5.1891891891891895, "percentage": 51.89, "elapsed_time": "2:36:37", "remaining_time": "2:25:12"} +{"current_steps": 961, "total_steps": 1850, "loss": 0.0506, "lr": 2.351498626561858e-06, "epoch": 5.194594594594594, "percentage": 51.95, "elapsed_time": "2:36:42", "remaining_time": "2:24:58"} +{"current_steps": 962, "total_steps": 1850, "loss": 0.1001, "lr": 2.3472609436393827e-06, "epoch": 5.2, "percentage": 52.0, "elapsed_time": "2:36:44", "remaining_time": "2:24:41"} +{"current_steps": 963, "total_steps": 1850, "loss": 0.0951, "lr": 2.3430237011767166e-06, "epoch": 5.205405405405405, "percentage": 52.05, "elapsed_time": "2:36:48", "remaining_time": "2:24:25"} +{"current_steps": 964, "total_steps": 1850, "loss": 0.0824, "lr": 2.3387869113929695e-06, "epoch": 5.210810810810811, "percentage": 52.11, "elapsed_time": "2:36:51", "remaining_time": "2:24:09"} +{"current_steps": 965, "total_steps": 1850, "loss": 0.0485, "lr": 2.3345505865059427e-06, "epoch": 5.216216216216216, "percentage": 52.16, "elapsed_time": "2:36:54", "remaining_time": "2:23:53"} +{"current_steps": 966, "total_steps": 1850, "loss": 0.1516, "lr": 2.3303147387320985e-06, "epoch": 5.221621621621622, "percentage": 52.22, "elapsed_time": "2:36:57", "remaining_time": "2:23:37"} +{"current_steps": 967, "total_steps": 1850, "loss": 0.1664, "lr": 2.3260793802865227e-06, "epoch": 5.227027027027027, "percentage": 52.27, "elapsed_time": "2:37:00", "remaining_time": "2:23:22"} +{"current_steps": 968, "total_steps": 1850, "loss": 0.1127, "lr": 2.3218445233828904e-06, "epoch": 5.232432432432432, "percentage": 52.32, "elapsed_time": "2:37:03", "remaining_time": "2:23:06"} +{"current_steps": 969, "total_steps": 1850, "loss": 0.0445, "lr": 2.31761018023343e-06, "epoch": 5.237837837837838, "percentage": 52.38, "elapsed_time": "2:37:05", "remaining_time": "2:22:49"} +{"current_steps": 970, "total_steps": 1850, "loss": 0.1402, "lr": 2.3133763630488883e-06, "epoch": 5.243243243243243, "percentage": 52.43, "elapsed_time": "2:37:06", "remaining_time": "2:22:31"} +{"current_steps": 971, "total_steps": 1850, "loss": 0.0332, "lr": 2.3091430840384964e-06, "epoch": 5.248648648648649, "percentage": 52.49, "elapsed_time": "2:37:09", "remaining_time": "2:22:16"} +{"current_steps": 972, "total_steps": 1850, "loss": 0.1266, "lr": 2.304910355409932e-06, "epoch": 5.254054054054054, "percentage": 52.54, "elapsed_time": "2:37:12", "remaining_time": "2:22:00"} +{"current_steps": 973, "total_steps": 1850, "loss": 0.0281, "lr": 2.3006781893692863e-06, "epoch": 5.2594594594594595, "percentage": 52.59, "elapsed_time": "2:37:14", "remaining_time": "2:21:43"} +{"current_steps": 974, "total_steps": 1850, "loss": 0.0238, "lr": 2.2964465981210283e-06, "epoch": 5.264864864864865, "percentage": 52.65, "elapsed_time": "2:37:15", "remaining_time": "2:21:26"} +{"current_steps": 975, "total_steps": 1850, "loss": 0.0828, "lr": 2.2922155938679695e-06, "epoch": 5.27027027027027, "percentage": 52.7, "elapsed_time": "2:37:18", "remaining_time": "2:21:10"} +{"current_steps": 976, "total_steps": 1850, "loss": 0.1874, "lr": 2.287985188811228e-06, "epoch": 5.275675675675676, "percentage": 52.76, "elapsed_time": "2:37:20", "remaining_time": "2:20:53"} +{"current_steps": 977, "total_steps": 1850, "loss": 0.0413, "lr": 2.2837553951501935e-06, "epoch": 5.281081081081081, "percentage": 52.81, "elapsed_time": "2:37:24", "remaining_time": "2:20:39"} +{"current_steps": 978, "total_steps": 1850, "loss": 0.0909, "lr": 2.279526225082495e-06, "epoch": 5.286486486486487, "percentage": 52.86, "elapsed_time": "2:37:30", "remaining_time": "2:20:26"} +{"current_steps": 979, "total_steps": 1850, "loss": 0.0798, "lr": 2.275297690803962e-06, "epoch": 5.291891891891892, "percentage": 52.92, "elapsed_time": "2:37:31", "remaining_time": "2:20:08"} +{"current_steps": 980, "total_steps": 1850, "loss": 0.1456, "lr": 2.271069804508589e-06, "epoch": 5.297297297297297, "percentage": 52.97, "elapsed_time": "2:37:35", "remaining_time": "2:19:54"} +{"current_steps": 981, "total_steps": 1850, "loss": 0.085, "lr": 2.266842578388504e-06, "epoch": 5.302702702702703, "percentage": 53.03, "elapsed_time": "2:37:42", "remaining_time": "2:19:42"} +{"current_steps": 982, "total_steps": 1850, "loss": 0.0885, "lr": 2.2626160246339303e-06, "epoch": 5.308108108108108, "percentage": 53.08, "elapsed_time": "2:37:49", "remaining_time": "2:19:29"} +{"current_steps": 983, "total_steps": 1850, "loss": 0.1543, "lr": 2.2583901554331513e-06, "epoch": 5.313513513513514, "percentage": 53.14, "elapsed_time": "2:37:54", "remaining_time": "2:19:16"} +{"current_steps": 984, "total_steps": 1850, "loss": 0.06, "lr": 2.2541649829724783e-06, "epoch": 5.318918918918919, "percentage": 53.19, "elapsed_time": "2:37:57", "remaining_time": "2:19:00"} +{"current_steps": 985, "total_steps": 1850, "loss": 0.0518, "lr": 2.249940519436212e-06, "epoch": 5.324324324324325, "percentage": 53.24, "elapsed_time": "2:37:59", "remaining_time": "2:18:44"} +{"current_steps": 986, "total_steps": 1850, "loss": 0.1542, "lr": 2.2457167770066104e-06, "epoch": 5.3297297297297295, "percentage": 53.3, "elapsed_time": "2:38:03", "remaining_time": "2:18:30"} +{"current_steps": 987, "total_steps": 1850, "loss": 0.0338, "lr": 2.2414937678638495e-06, "epoch": 5.335135135135135, "percentage": 53.35, "elapsed_time": "2:38:07", "remaining_time": "2:18:15"} +{"current_steps": 988, "total_steps": 1850, "loss": 0.0204, "lr": 2.2372715041859925e-06, "epoch": 5.34054054054054, "percentage": 53.41, "elapsed_time": "2:38:08", "remaining_time": "2:17:58"} +{"current_steps": 989, "total_steps": 1850, "loss": 0.129, "lr": 2.2330499981489524e-06, "epoch": 5.345945945945946, "percentage": 53.46, "elapsed_time": "2:38:12", "remaining_time": "2:17:43"} +{"current_steps": 990, "total_steps": 1850, "loss": 0.0307, "lr": 2.2288292619264566e-06, "epoch": 5.351351351351352, "percentage": 53.51, "elapsed_time": "2:38:14", "remaining_time": "2:17:27"} +{"current_steps": 991, "total_steps": 1850, "loss": 0.0374, "lr": 2.2246093076900145e-06, "epoch": 5.356756756756757, "percentage": 53.57, "elapsed_time": "2:38:18", "remaining_time": "2:17:13"} +{"current_steps": 992, "total_steps": 1850, "loss": 0.0265, "lr": 2.220390147608878e-06, "epoch": 5.3621621621621625, "percentage": 53.62, "elapsed_time": "2:38:21", "remaining_time": "2:16:58"} +{"current_steps": 993, "total_steps": 1850, "loss": 0.0468, "lr": 2.2161717938500112e-06, "epoch": 5.367567567567567, "percentage": 53.68, "elapsed_time": "2:38:24", "remaining_time": "2:16:43"} +{"current_steps": 994, "total_steps": 1850, "loss": 0.1118, "lr": 2.2119542585780513e-06, "epoch": 5.372972972972973, "percentage": 53.73, "elapsed_time": "2:38:31", "remaining_time": "2:16:30"} +{"current_steps": 995, "total_steps": 1850, "loss": 0.2056, "lr": 2.2077375539552764e-06, "epoch": 5.378378378378378, "percentage": 53.78, "elapsed_time": "2:38:35", "remaining_time": "2:16:16"} +{"current_steps": 996, "total_steps": 1850, "loss": 0.0437, "lr": 2.203521692141568e-06, "epoch": 5.383783783783784, "percentage": 53.84, "elapsed_time": "2:38:37", "remaining_time": "2:16:00"} +{"current_steps": 997, "total_steps": 1850, "loss": 0.1981, "lr": 2.199306685294377e-06, "epoch": 5.389189189189189, "percentage": 53.89, "elapsed_time": "2:38:40", "remaining_time": "2:15:45"} +{"current_steps": 998, "total_steps": 1850, "loss": 0.0756, "lr": 2.1950925455686906e-06, "epoch": 5.394594594594595, "percentage": 53.95, "elapsed_time": "2:38:42", "remaining_time": "2:15:29"} +{"current_steps": 999, "total_steps": 1850, "loss": 0.0998, "lr": 2.1908792851169954e-06, "epoch": 5.4, "percentage": 54.0, "elapsed_time": "2:38:48", "remaining_time": "2:15:16"} +{"current_steps": 1000, "total_steps": 1850, "loss": 0.0223, "lr": 2.186666916089239e-06, "epoch": 5.405405405405405, "percentage": 54.05, "elapsed_time": "2:38:50", "remaining_time": "2:15:00"} +{"current_steps": 1001, "total_steps": 1850, "loss": 0.0489, "lr": 2.1824554506328033e-06, "epoch": 5.410810810810811, "percentage": 54.11, "elapsed_time": "2:38:55", "remaining_time": "2:14:47"} +{"current_steps": 1002, "total_steps": 1850, "loss": 0.0321, "lr": 2.17824490089246e-06, "epoch": 5.416216216216216, "percentage": 54.16, "elapsed_time": "2:38:58", "remaining_time": "2:14:32"} +{"current_steps": 1003, "total_steps": 1850, "loss": 0.0167, "lr": 2.174035279010343e-06, "epoch": 5.421621621621622, "percentage": 54.22, "elapsed_time": "2:39:00", "remaining_time": "2:14:16"} +{"current_steps": 1004, "total_steps": 1850, "loss": 0.0588, "lr": 2.1698265971259104e-06, "epoch": 5.427027027027027, "percentage": 54.27, "elapsed_time": "2:39:05", "remaining_time": "2:14:03"} +{"current_steps": 1005, "total_steps": 1850, "loss": 0.0868, "lr": 2.1656188673759065e-06, "epoch": 5.4324324324324325, "percentage": 54.32, "elapsed_time": "2:39:13", "remaining_time": "2:13:52"} +{"current_steps": 1006, "total_steps": 1850, "loss": 0.1131, "lr": 2.1614121018943346e-06, "epoch": 5.437837837837838, "percentage": 54.38, "elapsed_time": "2:39:16", "remaining_time": "2:13:37"} +{"current_steps": 1007, "total_steps": 1850, "loss": 0.0285, "lr": 2.1572063128124133e-06, "epoch": 5.443243243243243, "percentage": 54.43, "elapsed_time": "2:39:20", "remaining_time": "2:13:23"} +{"current_steps": 1008, "total_steps": 1850, "loss": 0.0303, "lr": 2.153001512258548e-06, "epoch": 5.448648648648649, "percentage": 54.49, "elapsed_time": "2:39:21", "remaining_time": "2:13:07"} +{"current_steps": 1009, "total_steps": 1850, "loss": 0.3278, "lr": 2.1487977123582922e-06, "epoch": 5.454054054054054, "percentage": 54.54, "elapsed_time": "2:39:25", "remaining_time": "2:12:52"} +{"current_steps": 1010, "total_steps": 1850, "loss": 0.0346, "lr": 2.144594925234314e-06, "epoch": 5.45945945945946, "percentage": 54.59, "elapsed_time": "2:39:28", "remaining_time": "2:12:37"} +{"current_steps": 1011, "total_steps": 1850, "loss": 0.0874, "lr": 2.140393163006362e-06, "epoch": 5.464864864864865, "percentage": 54.65, "elapsed_time": "2:39:34", "remaining_time": "2:12:25"} +{"current_steps": 1012, "total_steps": 1850, "loss": 0.0194, "lr": 2.1361924377912266e-06, "epoch": 5.47027027027027, "percentage": 54.7, "elapsed_time": "2:39:35", "remaining_time": "2:12:09"} +{"current_steps": 1013, "total_steps": 1850, "loss": 0.1193, "lr": 2.1319927617027112e-06, "epoch": 5.475675675675676, "percentage": 54.76, "elapsed_time": "2:39:37", "remaining_time": "2:11:53"} +{"current_steps": 1014, "total_steps": 1850, "loss": 0.0331, "lr": 2.1277941468515908e-06, "epoch": 5.481081081081081, "percentage": 54.81, "elapsed_time": "2:39:40", "remaining_time": "2:11:38"} +{"current_steps": 1015, "total_steps": 1850, "loss": 0.0723, "lr": 2.123596605345582e-06, "epoch": 5.486486486486487, "percentage": 54.86, "elapsed_time": "2:39:42", "remaining_time": "2:11:22"} +{"current_steps": 1016, "total_steps": 1850, "loss": 0.0751, "lr": 2.119400149289305e-06, "epoch": 5.491891891891892, "percentage": 54.92, "elapsed_time": "2:39:44", "remaining_time": "2:11:07"} +{"current_steps": 1017, "total_steps": 1850, "loss": 0.0265, "lr": 2.11520479078425e-06, "epoch": 5.4972972972972975, "percentage": 54.97, "elapsed_time": "2:39:46", "remaining_time": "2:10:52"} +{"current_steps": 1018, "total_steps": 1850, "loss": 0.1023, "lr": 2.111010541928743e-06, "epoch": 5.5027027027027025, "percentage": 55.03, "elapsed_time": "2:39:48", "remaining_time": "2:10:36"} +{"current_steps": 1019, "total_steps": 1850, "loss": 0.0831, "lr": 2.10681741481791e-06, "epoch": 5.508108108108108, "percentage": 55.08, "elapsed_time": "2:39:52", "remaining_time": "2:10:22"} +{"current_steps": 1020, "total_steps": 1850, "loss": 0.1258, "lr": 2.1026254215436408e-06, "epoch": 5.513513513513513, "percentage": 55.14, "elapsed_time": "2:39:55", "remaining_time": "2:10:08"} +{"current_steps": 1021, "total_steps": 1850, "loss": 0.0926, "lr": 2.098434574194557e-06, "epoch": 5.518918918918919, "percentage": 55.19, "elapsed_time": "2:39:58", "remaining_time": "2:09:53"} +{"current_steps": 1022, "total_steps": 1850, "loss": 0.0306, "lr": 2.094244884855974e-06, "epoch": 5.524324324324324, "percentage": 55.24, "elapsed_time": "2:40:01", "remaining_time": "2:09:38"} +{"current_steps": 1023, "total_steps": 1850, "loss": 0.1374, "lr": 2.0900563656098706e-06, "epoch": 5.52972972972973, "percentage": 55.3, "elapsed_time": "2:40:04", "remaining_time": "2:09:24"} +{"current_steps": 1024, "total_steps": 1850, "loss": 0.1173, "lr": 2.085869028534848e-06, "epoch": 5.535135135135135, "percentage": 55.35, "elapsed_time": "2:40:07", "remaining_time": "2:09:09"} +{"current_steps": 1025, "total_steps": 1850, "loss": 0.146, "lr": 2.0816828857061e-06, "epoch": 5.54054054054054, "percentage": 55.41, "elapsed_time": "2:40:12", "remaining_time": "2:08:57"} +{"current_steps": 1026, "total_steps": 1850, "loss": 0.1542, "lr": 2.077497949195378e-06, "epoch": 5.545945945945946, "percentage": 55.46, "elapsed_time": "2:40:14", "remaining_time": "2:08:41"} +{"current_steps": 1027, "total_steps": 1850, "loss": 0.0699, "lr": 2.073314231070951e-06, "epoch": 5.551351351351351, "percentage": 55.51, "elapsed_time": "2:40:20", "remaining_time": "2:08:29"} +{"current_steps": 1028, "total_steps": 1850, "loss": 0.1429, "lr": 2.069131743397578e-06, "epoch": 5.556756756756757, "percentage": 55.57, "elapsed_time": "2:40:21", "remaining_time": "2:08:13"} +{"current_steps": 1029, "total_steps": 1850, "loss": 0.1203, "lr": 2.0649504982364674e-06, "epoch": 5.562162162162162, "percentage": 55.62, "elapsed_time": "2:40:25", "remaining_time": "2:07:59"} +{"current_steps": 1030, "total_steps": 1850, "loss": 0.1078, "lr": 2.0607705076452465e-06, "epoch": 5.5675675675675675, "percentage": 55.68, "elapsed_time": "2:40:28", "remaining_time": "2:07:45"} +{"current_steps": 1031, "total_steps": 1850, "loss": 0.0881, "lr": 2.056591783677923e-06, "epoch": 5.572972972972973, "percentage": 55.73, "elapsed_time": "2:40:33", "remaining_time": "2:07:32"} +{"current_steps": 1032, "total_steps": 1850, "loss": 0.0586, "lr": 2.0524143383848525e-06, "epoch": 5.578378378378378, "percentage": 55.78, "elapsed_time": "2:40:35", "remaining_time": "2:07:17"} +{"current_steps": 1033, "total_steps": 1850, "loss": 0.3671, "lr": 2.048238183812704e-06, "epoch": 5.583783783783784, "percentage": 55.84, "elapsed_time": "2:40:37", "remaining_time": "2:07:02"} +{"current_steps": 1034, "total_steps": 1850, "loss": 0.048, "lr": 2.0440633320044224e-06, "epoch": 5.589189189189189, "percentage": 55.89, "elapsed_time": "2:40:40", "remaining_time": "2:06:47"} +{"current_steps": 1035, "total_steps": 1850, "loss": 0.2091, "lr": 2.0398897949991992e-06, "epoch": 5.594594594594595, "percentage": 55.95, "elapsed_time": "2:40:43", "remaining_time": "2:06:33"} +{"current_steps": 1036, "total_steps": 1850, "loss": 0.1295, "lr": 2.0357175848324306e-06, "epoch": 5.6, "percentage": 56.0, "elapsed_time": "2:40:45", "remaining_time": "2:06:18"} +{"current_steps": 1037, "total_steps": 1850, "loss": 0.0504, "lr": 2.031546713535688e-06, "epoch": 5.605405405405405, "percentage": 56.05, "elapsed_time": "2:40:46", "remaining_time": "2:06:02"} +{"current_steps": 1038, "total_steps": 1850, "loss": 0.1816, "lr": 2.027377193136684e-06, "epoch": 5.610810810810811, "percentage": 56.11, "elapsed_time": "2:40:50", "remaining_time": "2:05:49"} +{"current_steps": 1039, "total_steps": 1850, "loss": 0.0392, "lr": 2.0232090356592333e-06, "epoch": 5.616216216216216, "percentage": 56.16, "elapsed_time": "2:40:54", "remaining_time": "2:05:36"} +{"current_steps": 1040, "total_steps": 1850, "loss": 0.0273, "lr": 2.0190422531232186e-06, "epoch": 5.621621621621622, "percentage": 56.22, "elapsed_time": "2:40:56", "remaining_time": "2:05:20"} +{"current_steps": 1041, "total_steps": 1850, "loss": 0.0672, "lr": 2.014876857544562e-06, "epoch": 5.627027027027027, "percentage": 56.27, "elapsed_time": "2:40:59", "remaining_time": "2:05:06"} +{"current_steps": 1042, "total_steps": 1850, "loss": 0.0749, "lr": 2.0107128609351817e-06, "epoch": 5.632432432432433, "percentage": 56.32, "elapsed_time": "2:41:01", "remaining_time": "2:04:51"} +{"current_steps": 1043, "total_steps": 1850, "loss": 0.0713, "lr": 2.006550275302965e-06, "epoch": 5.6378378378378375, "percentage": 56.38, "elapsed_time": "2:41:04", "remaining_time": "2:04:37"} +{"current_steps": 1044, "total_steps": 1850, "loss": 0.0547, "lr": 2.002389112651728e-06, "epoch": 5.643243243243243, "percentage": 56.43, "elapsed_time": "2:41:07", "remaining_time": "2:04:23"} +{"current_steps": 1045, "total_steps": 1850, "loss": 0.0304, "lr": 1.9982293849811852e-06, "epoch": 5.648648648648649, "percentage": 56.49, "elapsed_time": "2:41:11", "remaining_time": "2:04:10"} +{"current_steps": 1046, "total_steps": 1850, "loss": 0.0227, "lr": 1.994071104286911e-06, "epoch": 5.654054054054054, "percentage": 56.54, "elapsed_time": "2:41:13", "remaining_time": "2:03:55"} +{"current_steps": 1047, "total_steps": 1850, "loss": 0.0811, "lr": 1.9899142825603078e-06, "epoch": 5.65945945945946, "percentage": 56.59, "elapsed_time": "2:41:16", "remaining_time": "2:03:41"} +{"current_steps": 1048, "total_steps": 1850, "loss": 0.0292, "lr": 1.9857589317885727e-06, "epoch": 5.664864864864865, "percentage": 56.65, "elapsed_time": "2:41:18", "remaining_time": "2:03:26"} +{"current_steps": 1049, "total_steps": 1850, "loss": 0.0386, "lr": 1.9816050639546566e-06, "epoch": 5.6702702702702705, "percentage": 56.7, "elapsed_time": "2:41:21", "remaining_time": "2:03:12"} +{"current_steps": 1050, "total_steps": 1850, "loss": 0.1448, "lr": 1.977452691037239e-06, "epoch": 5.675675675675675, "percentage": 56.76, "elapsed_time": "2:41:25", "remaining_time": "2:02:59"} +{"current_steps": 1051, "total_steps": 1850, "loss": 0.0451, "lr": 1.973301825010685e-06, "epoch": 5.681081081081081, "percentage": 56.81, "elapsed_time": "2:41:27", "remaining_time": "2:02:44"} +{"current_steps": 1052, "total_steps": 1850, "loss": 0.0708, "lr": 1.9691524778450145e-06, "epoch": 5.686486486486486, "percentage": 56.86, "elapsed_time": "2:41:31", "remaining_time": "2:02:31"} +{"current_steps": 1053, "total_steps": 1850, "loss": 0.0311, "lr": 1.96500466150587e-06, "epoch": 5.691891891891892, "percentage": 56.92, "elapsed_time": "2:41:34", "remaining_time": "2:02:17"} +{"current_steps": 1054, "total_steps": 1850, "loss": 0.0728, "lr": 1.960858387954476e-06, "epoch": 5.697297297297297, "percentage": 56.97, "elapsed_time": "2:41:36", "remaining_time": "2:02:03"} +{"current_steps": 1055, "total_steps": 1850, "loss": 0.1429, "lr": 1.956713669147611e-06, "epoch": 5.702702702702703, "percentage": 57.03, "elapsed_time": "2:41:43", "remaining_time": "2:01:51"} +{"current_steps": 1056, "total_steps": 1850, "loss": 0.0702, "lr": 1.9525705170375674e-06, "epoch": 5.708108108108108, "percentage": 57.08, "elapsed_time": "2:41:46", "remaining_time": "2:01:38"} +{"current_steps": 1057, "total_steps": 1850, "loss": 0.0934, "lr": 1.948428943572121e-06, "epoch": 5.713513513513513, "percentage": 57.14, "elapsed_time": "2:41:52", "remaining_time": "2:01:26"} +{"current_steps": 1058, "total_steps": 1850, "loss": 0.0327, "lr": 1.944288960694497e-06, "epoch": 5.718918918918919, "percentage": 57.19, "elapsed_time": "2:41:54", "remaining_time": "2:01:12"} +{"current_steps": 1059, "total_steps": 1850, "loss": 0.1025, "lr": 1.9401505803433308e-06, "epoch": 5.724324324324324, "percentage": 57.24, "elapsed_time": "2:41:59", "remaining_time": "2:00:59"} +{"current_steps": 1060, "total_steps": 1850, "loss": 0.0825, "lr": 1.9360138144526363e-06, "epoch": 5.72972972972973, "percentage": 57.3, "elapsed_time": "2:42:02", "remaining_time": "2:00:46"} +{"current_steps": 1061, "total_steps": 1850, "loss": 0.164, "lr": 1.9318786749517754e-06, "epoch": 5.735135135135135, "percentage": 57.35, "elapsed_time": "2:42:06", "remaining_time": "2:00:32"} +{"current_steps": 1062, "total_steps": 1850, "loss": 0.0574, "lr": 1.9277451737654154e-06, "epoch": 5.7405405405405405, "percentage": 57.41, "elapsed_time": "2:42:08", "remaining_time": "2:00:18"} +{"current_steps": 1063, "total_steps": 1850, "loss": 0.2916, "lr": 1.923613322813503e-06, "epoch": 5.745945945945946, "percentage": 57.46, "elapsed_time": "2:42:11", "remaining_time": "2:00:04"} +{"current_steps": 1064, "total_steps": 1850, "loss": 0.0626, "lr": 1.9194831340112228e-06, "epoch": 5.751351351351351, "percentage": 57.51, "elapsed_time": "2:42:15", "remaining_time": "1:59:51"} +{"current_steps": 1065, "total_steps": 1850, "loss": 0.0544, "lr": 1.915354619268969e-06, "epoch": 5.756756756756757, "percentage": 57.57, "elapsed_time": "2:42:19", "remaining_time": "1:59:39"} +{"current_steps": 1066, "total_steps": 1850, "loss": 0.0145, "lr": 1.9112277904923064e-06, "epoch": 5.762162162162162, "percentage": 57.62, "elapsed_time": "2:42:20", "remaining_time": "1:59:24"} +{"current_steps": 1067, "total_steps": 1850, "loss": 0.0335, "lr": 1.9071026595819387e-06, "epoch": 5.767567567567568, "percentage": 57.68, "elapsed_time": "2:42:22", "remaining_time": "1:59:09"} +{"current_steps": 1068, "total_steps": 1850, "loss": 0.1385, "lr": 1.902979238433673e-06, "epoch": 5.772972972972973, "percentage": 57.73, "elapsed_time": "2:42:25", "remaining_time": "1:58:55"} +{"current_steps": 1069, "total_steps": 1850, "loss": 0.0523, "lr": 1.8988575389383853e-06, "epoch": 5.778378378378378, "percentage": 57.78, "elapsed_time": "2:42:28", "remaining_time": "1:58:42"} +{"current_steps": 1070, "total_steps": 1850, "loss": 0.171, "lr": 1.8947375729819894e-06, "epoch": 5.783783783783784, "percentage": 57.84, "elapsed_time": "2:42:32", "remaining_time": "1:58:29"} +{"current_steps": 1071, "total_steps": 1850, "loss": 0.0431, "lr": 1.8906193524453964e-06, "epoch": 5.789189189189189, "percentage": 57.89, "elapsed_time": "2:42:35", "remaining_time": "1:58:15"} +{"current_steps": 1072, "total_steps": 1850, "loss": 0.0157, "lr": 1.886502889204487e-06, "epoch": 5.794594594594595, "percentage": 57.95, "elapsed_time": "2:42:37", "remaining_time": "1:58:01"} +{"current_steps": 1073, "total_steps": 1850, "loss": 0.0892, "lr": 1.882388195130073e-06, "epoch": 5.8, "percentage": 58.0, "elapsed_time": "2:42:39", "remaining_time": "1:57:46"} +{"current_steps": 1074, "total_steps": 1850, "loss": 0.0376, "lr": 1.8782752820878636e-06, "epoch": 5.805405405405406, "percentage": 58.05, "elapsed_time": "2:42:43", "remaining_time": "1:57:34"} +{"current_steps": 1075, "total_steps": 1850, "loss": 0.1174, "lr": 1.8741641619384343e-06, "epoch": 5.8108108108108105, "percentage": 58.11, "elapsed_time": "2:42:44", "remaining_time": "1:57:19"} +{"current_steps": 1076, "total_steps": 1850, "loss": 0.0191, "lr": 1.8700548465371877e-06, "epoch": 5.816216216216216, "percentage": 58.16, "elapsed_time": "2:42:46", "remaining_time": "1:57:05"} +{"current_steps": 1077, "total_steps": 1850, "loss": 0.1243, "lr": 1.8659473477343233e-06, "epoch": 5.821621621621622, "percentage": 58.22, "elapsed_time": "2:42:51", "remaining_time": "1:56:52"} +{"current_steps": 1078, "total_steps": 1850, "loss": 0.1457, "lr": 1.8618416773748032e-06, "epoch": 5.827027027027027, "percentage": 58.27, "elapsed_time": "2:42:53", "remaining_time": "1:56:39"} +{"current_steps": 1079, "total_steps": 1850, "loss": 0.0366, "lr": 1.8577378472983148e-06, "epoch": 5.832432432432433, "percentage": 58.32, "elapsed_time": "2:42:55", "remaining_time": "1:56:25"} +{"current_steps": 1080, "total_steps": 1850, "loss": 0.065, "lr": 1.8536358693392398e-06, "epoch": 5.837837837837838, "percentage": 58.38, "elapsed_time": "2:42:58", "remaining_time": "1:56:11"} +{"current_steps": 1081, "total_steps": 1850, "loss": 0.1902, "lr": 1.8495357553266176e-06, "epoch": 5.8432432432432435, "percentage": 58.43, "elapsed_time": "2:43:01", "remaining_time": "1:55:58"} +{"current_steps": 1082, "total_steps": 1850, "loss": 0.0372, "lr": 1.8454375170841133e-06, "epoch": 5.848648648648648, "percentage": 58.49, "elapsed_time": "2:43:05", "remaining_time": "1:55:45"} +{"current_steps": 1083, "total_steps": 1850, "loss": 0.0942, "lr": 1.841341166429983e-06, "epoch": 5.854054054054054, "percentage": 58.54, "elapsed_time": "2:43:07", "remaining_time": "1:55:31"} +{"current_steps": 1084, "total_steps": 1850, "loss": 0.2317, "lr": 1.8372467151770391e-06, "epoch": 5.859459459459459, "percentage": 58.59, "elapsed_time": "2:43:10", "remaining_time": "1:55:18"} +{"current_steps": 1085, "total_steps": 1850, "loss": 0.1935, "lr": 1.8331541751326168e-06, "epoch": 5.864864864864865, "percentage": 58.65, "elapsed_time": "2:43:14", "remaining_time": "1:55:05"} +{"current_steps": 1086, "total_steps": 1850, "loss": 0.0905, "lr": 1.8290635580985395e-06, "epoch": 5.87027027027027, "percentage": 58.7, "elapsed_time": "2:43:16", "remaining_time": "1:54:51"} +{"current_steps": 1087, "total_steps": 1850, "loss": 0.0931, "lr": 1.8249748758710856e-06, "epoch": 5.875675675675676, "percentage": 58.76, "elapsed_time": "2:43:22", "remaining_time": "1:54:40"} +{"current_steps": 1088, "total_steps": 1850, "loss": 0.0878, "lr": 1.8208881402409542e-06, "epoch": 5.881081081081081, "percentage": 58.81, "elapsed_time": "2:43:29", "remaining_time": "1:54:30"} +{"current_steps": 1089, "total_steps": 1850, "loss": 0.1317, "lr": 1.8168033629932296e-06, "epoch": 5.886486486486486, "percentage": 58.86, "elapsed_time": "2:43:32", "remaining_time": "1:54:16"} +{"current_steps": 1090, "total_steps": 1850, "loss": 0.027, "lr": 1.8127205559073507e-06, "epoch": 5.891891891891892, "percentage": 58.92, "elapsed_time": "2:43:34", "remaining_time": "1:54:03"} +{"current_steps": 1091, "total_steps": 1850, "loss": 0.0872, "lr": 1.8086397307570724e-06, "epoch": 5.897297297297297, "percentage": 58.97, "elapsed_time": "2:43:37", "remaining_time": "1:53:49"} +{"current_steps": 1092, "total_steps": 1850, "loss": 0.0821, "lr": 1.8045608993104373e-06, "epoch": 5.902702702702703, "percentage": 59.03, "elapsed_time": "2:43:39", "remaining_time": "1:53:36"} +{"current_steps": 1093, "total_steps": 1850, "loss": 0.0327, "lr": 1.8004840733297365e-06, "epoch": 5.908108108108108, "percentage": 59.08, "elapsed_time": "2:43:41", "remaining_time": "1:53:22"} +{"current_steps": 1094, "total_steps": 1850, "loss": 0.0497, "lr": 1.7964092645714777e-06, "epoch": 5.9135135135135135, "percentage": 59.14, "elapsed_time": "2:43:44", "remaining_time": "1:53:09"} +{"current_steps": 1095, "total_steps": 1850, "loss": 0.0307, "lr": 1.7923364847863527e-06, "epoch": 5.918918918918919, "percentage": 59.19, "elapsed_time": "2:43:46", "remaining_time": "1:52:55"} +{"current_steps": 1096, "total_steps": 1850, "loss": 0.0897, "lr": 1.7882657457192015e-06, "epoch": 5.924324324324324, "percentage": 59.24, "elapsed_time": "2:43:47", "remaining_time": "1:52:41"} +{"current_steps": 1097, "total_steps": 1850, "loss": 0.1545, "lr": 1.784197059108979e-06, "epoch": 5.92972972972973, "percentage": 59.3, "elapsed_time": "2:43:49", "remaining_time": "1:52:27"} +{"current_steps": 1098, "total_steps": 1850, "loss": 0.0509, "lr": 1.7801304366887235e-06, "epoch": 5.935135135135135, "percentage": 59.35, "elapsed_time": "2:43:52", "remaining_time": "1:52:14"} +{"current_steps": 1099, "total_steps": 1850, "loss": 0.0821, "lr": 1.776065890185517e-06, "epoch": 5.940540540540541, "percentage": 59.41, "elapsed_time": "2:43:53", "remaining_time": "1:51:59"} +{"current_steps": 1100, "total_steps": 1850, "loss": 0.0182, "lr": 1.7720034313204582e-06, "epoch": 5.945945945945946, "percentage": 59.46, "elapsed_time": "2:43:56", "remaining_time": "1:51:46"} +{"current_steps": 1101, "total_steps": 1850, "loss": 0.1027, "lr": 1.7679430718086244e-06, "epoch": 5.951351351351351, "percentage": 59.51, "elapsed_time": "2:44:01", "remaining_time": "1:51:35"} +{"current_steps": 1102, "total_steps": 1850, "loss": 0.0413, "lr": 1.763884823359038e-06, "epoch": 5.956756756756757, "percentage": 59.57, "elapsed_time": "2:44:04", "remaining_time": "1:51:21"} +{"current_steps": 1103, "total_steps": 1850, "loss": 0.1079, "lr": 1.759828697674636e-06, "epoch": 5.962162162162162, "percentage": 59.62, "elapsed_time": "2:44:06", "remaining_time": "1:51:08"} +{"current_steps": 1104, "total_steps": 1850, "loss": 0.0952, "lr": 1.7557747064522312e-06, "epoch": 5.967567567567568, "percentage": 59.68, "elapsed_time": "2:44:08", "remaining_time": "1:50:55"} +{"current_steps": 1105, "total_steps": 1850, "loss": 0.3393, "lr": 1.7517228613824836e-06, "epoch": 5.972972972972973, "percentage": 59.73, "elapsed_time": "2:44:11", "remaining_time": "1:50:41"} +{"current_steps": 1106, "total_steps": 1850, "loss": 0.0207, "lr": 1.747673174149862e-06, "epoch": 5.978378378378379, "percentage": 59.78, "elapsed_time": "2:44:15", "remaining_time": "1:50:29"} +{"current_steps": 1107, "total_steps": 1850, "loss": 0.1708, "lr": 1.743625656432615e-06, "epoch": 5.9837837837837835, "percentage": 59.84, "elapsed_time": "2:44:20", "remaining_time": "1:50:18"} +{"current_steps": 1108, "total_steps": 1850, "loss": 0.0569, "lr": 1.7395803199027325e-06, "epoch": 5.989189189189189, "percentage": 59.89, "elapsed_time": "2:44:25", "remaining_time": "1:50:06"} +{"current_steps": 1109, "total_steps": 1850, "loss": 0.0861, "lr": 1.7355371762259155e-06, "epoch": 5.994594594594595, "percentage": 59.95, "elapsed_time": "2:44:27", "remaining_time": "1:49:52"} +{"current_steps": 1110, "total_steps": 1850, "loss": 0.0571, "lr": 1.7314962370615423e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "2:44:34", "remaining_time": "1:49:42"} +{"current_steps": 1111, "total_steps": 1850, "loss": 0.0215, "lr": 1.7274575140626318e-06, "epoch": 6.005405405405406, "percentage": 60.05, "elapsed_time": "2:49:13", "remaining_time": "1:52:33"} +{"current_steps": 1112, "total_steps": 1850, "loss": 0.074, "lr": 1.7234210188758144e-06, "epoch": 6.010810810810811, "percentage": 60.11, "elapsed_time": "2:49:18", "remaining_time": "1:52:22"} +{"current_steps": 1113, "total_steps": 1850, "loss": 0.0481, "lr": 1.7193867631412942e-06, "epoch": 6.0162162162162165, "percentage": 60.16, "elapsed_time": "2:49:20", "remaining_time": "1:52:07"} +{"current_steps": 1114, "total_steps": 1850, "loss": 0.0253, "lr": 1.7153547584928185e-06, "epoch": 6.021621621621621, "percentage": 60.22, "elapsed_time": "2:49:22", "remaining_time": "1:51:54"} +{"current_steps": 1115, "total_steps": 1850, "loss": 0.0231, "lr": 1.7113250165576422e-06, "epoch": 6.027027027027027, "percentage": 60.27, "elapsed_time": "2:49:24", "remaining_time": "1:51:40"} +{"current_steps": 1116, "total_steps": 1850, "loss": 0.0517, "lr": 1.7072975489564958e-06, "epoch": 6.032432432432432, "percentage": 60.32, "elapsed_time": "2:49:27", "remaining_time": "1:51:26"} +{"current_steps": 1117, "total_steps": 1850, "loss": 0.0099, "lr": 1.703272367303551e-06, "epoch": 6.037837837837838, "percentage": 60.38, "elapsed_time": "2:49:30", "remaining_time": "1:51:13"} +{"current_steps": 1118, "total_steps": 1850, "loss": 0.0403, "lr": 1.6992494832063861e-06, "epoch": 6.043243243243243, "percentage": 60.43, "elapsed_time": "2:49:34", "remaining_time": "1:51:01"} +{"current_steps": 1119, "total_steps": 1850, "loss": 0.1234, "lr": 1.6952289082659568e-06, "epoch": 6.048648648648649, "percentage": 60.49, "elapsed_time": "2:49:36", "remaining_time": "1:50:47"} +{"current_steps": 1120, "total_steps": 1850, "loss": 0.0496, "lr": 1.6912106540765583e-06, "epoch": 6.054054054054054, "percentage": 60.54, "elapsed_time": "2:49:39", "remaining_time": "1:50:34"} +{"current_steps": 1121, "total_steps": 1850, "loss": 0.0725, "lr": 1.6871947322257915e-06, "epoch": 6.059459459459459, "percentage": 60.59, "elapsed_time": "2:49:42", "remaining_time": "1:50:21"} +{"current_steps": 1122, "total_steps": 1850, "loss": 0.0051, "lr": 1.6831811542945342e-06, "epoch": 6.064864864864865, "percentage": 60.65, "elapsed_time": "2:49:46", "remaining_time": "1:50:09"} +{"current_steps": 1123, "total_steps": 1850, "loss": 0.0135, "lr": 1.6791699318569039e-06, "epoch": 6.07027027027027, "percentage": 60.7, "elapsed_time": "2:49:49", "remaining_time": "1:49:56"} +{"current_steps": 1124, "total_steps": 1850, "loss": 0.031, "lr": 1.6751610764802246e-06, "epoch": 6.075675675675676, "percentage": 60.76, "elapsed_time": "2:49:51", "remaining_time": "1:49:43"} +{"current_steps": 1125, "total_steps": 1850, "loss": 0.0257, "lr": 1.6711545997249955e-06, "epoch": 6.081081081081081, "percentage": 60.81, "elapsed_time": "2:49:57", "remaining_time": "1:49:31"} +{"current_steps": 1126, "total_steps": 1850, "loss": 0.0733, "lr": 1.6671505131448562e-06, "epoch": 6.0864864864864865, "percentage": 60.86, "elapsed_time": "2:50:03", "remaining_time": "1:49:20"} +{"current_steps": 1127, "total_steps": 1850, "loss": 0.0113, "lr": 1.6631488282865537e-06, "epoch": 6.091891891891892, "percentage": 60.92, "elapsed_time": "2:50:05", "remaining_time": "1:49:07"} +{"current_steps": 1128, "total_steps": 1850, "loss": 0.0387, "lr": 1.6591495566899084e-06, "epoch": 6.097297297297297, "percentage": 60.97, "elapsed_time": "2:50:11", "remaining_time": "1:48:55"} +{"current_steps": 1129, "total_steps": 1850, "loss": 0.064, "lr": 1.6551527098877824e-06, "epoch": 6.102702702702703, "percentage": 61.03, "elapsed_time": "2:50:14", "remaining_time": "1:48:43"} +{"current_steps": 1130, "total_steps": 1850, "loss": 0.0676, "lr": 1.6511582994060443e-06, "epoch": 6.108108108108108, "percentage": 61.08, "elapsed_time": "2:50:17", "remaining_time": "1:48:30"} +{"current_steps": 1131, "total_steps": 1850, "loss": 0.1557, "lr": 1.6471663367635383e-06, "epoch": 6.113513513513514, "percentage": 61.14, "elapsed_time": "2:50:20", "remaining_time": "1:48:17"} +{"current_steps": 1132, "total_steps": 1850, "loss": 0.0117, "lr": 1.6431768334720486e-06, "epoch": 6.118918918918919, "percentage": 61.19, "elapsed_time": "2:50:24", "remaining_time": "1:48:04"} +{"current_steps": 1133, "total_steps": 1850, "loss": 0.0108, "lr": 1.6391898010362673e-06, "epoch": 6.124324324324324, "percentage": 61.24, "elapsed_time": "2:50:25", "remaining_time": "1:47:50"} +{"current_steps": 1134, "total_steps": 1850, "loss": 0.0165, "lr": 1.6352052509537623e-06, "epoch": 6.12972972972973, "percentage": 61.3, "elapsed_time": "2:50:28", "remaining_time": "1:47:38"} +{"current_steps": 1135, "total_steps": 1850, "loss": 0.0312, "lr": 1.6312231947149416e-06, "epoch": 6.135135135135135, "percentage": 61.35, "elapsed_time": "2:50:33", "remaining_time": "1:47:26"} +{"current_steps": 1136, "total_steps": 1850, "loss": 0.1021, "lr": 1.627243643803022e-06, "epoch": 6.140540540540541, "percentage": 61.41, "elapsed_time": "2:50:37", "remaining_time": "1:47:14"} +{"current_steps": 1137, "total_steps": 1850, "loss": 0.0387, "lr": 1.623266609693997e-06, "epoch": 6.145945945945946, "percentage": 61.46, "elapsed_time": "2:50:38", "remaining_time": "1:47:00"} +{"current_steps": 1138, "total_steps": 1850, "loss": 0.0973, "lr": 1.6192921038565993e-06, "epoch": 6.151351351351352, "percentage": 61.51, "elapsed_time": "2:50:41", "remaining_time": "1:46:47"} +{"current_steps": 1139, "total_steps": 1850, "loss": 0.0175, "lr": 1.615320137752274e-06, "epoch": 6.1567567567567565, "percentage": 61.57, "elapsed_time": "2:50:47", "remaining_time": "1:46:36"} +{"current_steps": 1140, "total_steps": 1850, "loss": 0.0072, "lr": 1.6113507228351411e-06, "epoch": 6.162162162162162, "percentage": 61.62, "elapsed_time": "2:50:49", "remaining_time": "1:46:23"} +{"current_steps": 1141, "total_steps": 1850, "loss": 0.0279, "lr": 1.6073838705519618e-06, "epoch": 6.167567567567567, "percentage": 61.68, "elapsed_time": "2:50:52", "remaining_time": "1:46:10"} +{"current_steps": 1142, "total_steps": 1850, "loss": 0.0171, "lr": 1.6034195923421106e-06, "epoch": 6.172972972972973, "percentage": 61.73, "elapsed_time": "2:50:56", "remaining_time": "1:45:58"} +{"current_steps": 1143, "total_steps": 1850, "loss": 0.0394, "lr": 1.5994578996375365e-06, "epoch": 6.178378378378379, "percentage": 61.78, "elapsed_time": "2:50:59", "remaining_time": "1:45:45"} +{"current_steps": 1144, "total_steps": 1850, "loss": 0.0218, "lr": 1.5954988038627327e-06, "epoch": 6.183783783783784, "percentage": 61.84, "elapsed_time": "2:51:01", "remaining_time": "1:45:32"} +{"current_steps": 1145, "total_steps": 1850, "loss": 0.0119, "lr": 1.5915423164347055e-06, "epoch": 6.1891891891891895, "percentage": 61.89, "elapsed_time": "2:51:03", "remaining_time": "1:45:19"} +{"current_steps": 1146, "total_steps": 1850, "loss": 0.0224, "lr": 1.5875884487629373e-06, "epoch": 6.194594594594594, "percentage": 61.95, "elapsed_time": "2:51:06", "remaining_time": "1:45:06"} +{"current_steps": 1147, "total_steps": 1850, "loss": 0.0156, "lr": 1.583637212249357e-06, "epoch": 6.2, "percentage": 62.0, "elapsed_time": "2:51:08", "remaining_time": "1:44:53"} +{"current_steps": 1148, "total_steps": 1850, "loss": 0.0761, "lr": 1.5796886182883053e-06, "epoch": 6.205405405405405, "percentage": 62.05, "elapsed_time": "2:51:11", "remaining_time": "1:44:41"} +{"current_steps": 1149, "total_steps": 1850, "loss": 0.1198, "lr": 1.575742678266503e-06, "epoch": 6.210810810810811, "percentage": 62.11, "elapsed_time": "2:51:15", "remaining_time": "1:44:29"} +{"current_steps": 1150, "total_steps": 1850, "loss": 0.0242, "lr": 1.5717994035630175e-06, "epoch": 6.216216216216216, "percentage": 62.16, "elapsed_time": "2:51:18", "remaining_time": "1:44:16"} +{"current_steps": 1151, "total_steps": 1850, "loss": 0.032, "lr": 1.5678588055492289e-06, "epoch": 6.221621621621622, "percentage": 62.22, "elapsed_time": "2:51:24", "remaining_time": "1:44:05"} +{"current_steps": 1152, "total_steps": 1850, "loss": 0.0376, "lr": 1.5639208955888008e-06, "epoch": 6.227027027027027, "percentage": 62.27, "elapsed_time": "2:51:30", "remaining_time": "1:43:54"} +{"current_steps": 1153, "total_steps": 1850, "loss": 0.0203, "lr": 1.5599856850376427e-06, "epoch": 6.232432432432432, "percentage": 62.32, "elapsed_time": "2:51:33", "remaining_time": "1:43:42"} +{"current_steps": 1154, "total_steps": 1850, "loss": 0.0316, "lr": 1.556053185243882e-06, "epoch": 6.237837837837838, "percentage": 62.38, "elapsed_time": "2:51:35", "remaining_time": "1:43:29"} +{"current_steps": 1155, "total_steps": 1850, "loss": 0.0068, "lr": 1.5521234075478264e-06, "epoch": 6.243243243243243, "percentage": 62.43, "elapsed_time": "2:51:38", "remaining_time": "1:43:16"} +{"current_steps": 1156, "total_steps": 1850, "loss": 0.0104, "lr": 1.548196363281937e-06, "epoch": 6.248648648648649, "percentage": 62.49, "elapsed_time": "2:51:39", "remaining_time": "1:43:03"} +{"current_steps": 1157, "total_steps": 1850, "loss": 0.0146, "lr": 1.5442720637707891e-06, "epoch": 6.254054054054054, "percentage": 62.54, "elapsed_time": "2:51:41", "remaining_time": "1:42:50"} +{"current_steps": 1158, "total_steps": 1850, "loss": 0.0298, "lr": 1.5403505203310442e-06, "epoch": 6.2594594594594595, "percentage": 62.59, "elapsed_time": "2:51:43", "remaining_time": "1:42:37"} +{"current_steps": 1159, "total_steps": 1850, "loss": 0.0238, "lr": 1.536431744271417e-06, "epoch": 6.264864864864865, "percentage": 62.65, "elapsed_time": "2:51:48", "remaining_time": "1:42:25"} +{"current_steps": 1160, "total_steps": 1850, "loss": 0.0125, "lr": 1.5325157468926415e-06, "epoch": 6.27027027027027, "percentage": 62.7, "elapsed_time": "2:51:50", "remaining_time": "1:42:13"} +{"current_steps": 1161, "total_steps": 1850, "loss": 0.0557, "lr": 1.5286025394874366e-06, "epoch": 6.275675675675676, "percentage": 62.76, "elapsed_time": "2:51:54", "remaining_time": "1:42:00"} +{"current_steps": 1162, "total_steps": 1850, "loss": 0.0271, "lr": 1.5246921333404786e-06, "epoch": 6.281081081081081, "percentage": 62.81, "elapsed_time": "2:51:56", "remaining_time": "1:41:47"} +{"current_steps": 1163, "total_steps": 1850, "loss": 0.0209, "lr": 1.520784539728363e-06, "epoch": 6.286486486486487, "percentage": 62.86, "elapsed_time": "2:52:03", "remaining_time": "1:41:38"} +{"current_steps": 1164, "total_steps": 1850, "loss": 0.0683, "lr": 1.5168797699195765e-06, "epoch": 6.291891891891892, "percentage": 62.92, "elapsed_time": "2:52:06", "remaining_time": "1:41:25"} +{"current_steps": 1165, "total_steps": 1850, "loss": 0.0881, "lr": 1.5129778351744622e-06, "epoch": 6.297297297297297, "percentage": 62.97, "elapsed_time": "2:52:09", "remaining_time": "1:41:13"} +{"current_steps": 1166, "total_steps": 1850, "loss": 0.0093, "lr": 1.5090787467451873e-06, "epoch": 6.302702702702703, "percentage": 63.03, "elapsed_time": "2:52:10", "remaining_time": "1:40:59"} +{"current_steps": 1167, "total_steps": 1850, "loss": 0.054, "lr": 1.5051825158757116e-06, "epoch": 6.308108108108108, "percentage": 63.08, "elapsed_time": "2:52:12", "remaining_time": "1:40:47"} +{"current_steps": 1168, "total_steps": 1850, "loss": 0.0498, "lr": 1.5012891538017538e-06, "epoch": 6.313513513513514, "percentage": 63.14, "elapsed_time": "2:52:18", "remaining_time": "1:40:36"} +{"current_steps": 1169, "total_steps": 1850, "loss": 0.0601, "lr": 1.49739867175076e-06, "epoch": 6.318918918918919, "percentage": 63.19, "elapsed_time": "2:52:21", "remaining_time": "1:40:24"} +{"current_steps": 1170, "total_steps": 1850, "loss": 0.0119, "lr": 1.4935110809418713e-06, "epoch": 6.324324324324325, "percentage": 63.24, "elapsed_time": "2:52:25", "remaining_time": "1:40:12"} +{"current_steps": 1171, "total_steps": 1850, "loss": 0.1361, "lr": 1.4896263925858903e-06, "epoch": 6.3297297297297295, "percentage": 63.3, "elapsed_time": "2:52:27", "remaining_time": "1:40:00"} +{"current_steps": 1172, "total_steps": 1850, "loss": 0.0738, "lr": 1.485744617885251e-06, "epoch": 6.335135135135135, "percentage": 63.35, "elapsed_time": "2:52:30", "remaining_time": "1:39:47"} +{"current_steps": 1173, "total_steps": 1850, "loss": 0.0392, "lr": 1.481865768033984e-06, "epoch": 6.34054054054054, "percentage": 63.41, "elapsed_time": "2:52:32", "remaining_time": "1:39:34"} +{"current_steps": 1174, "total_steps": 1850, "loss": 0.0158, "lr": 1.4779898542176865e-06, "epoch": 6.345945945945946, "percentage": 63.46, "elapsed_time": "2:52:36", "remaining_time": "1:39:23"} +{"current_steps": 1175, "total_steps": 1850, "loss": 0.0552, "lr": 1.4741168876134875e-06, "epoch": 6.351351351351352, "percentage": 63.51, "elapsed_time": "2:52:39", "remaining_time": "1:39:11"} +{"current_steps": 1176, "total_steps": 1850, "loss": 0.039, "lr": 1.4702468793900187e-06, "epoch": 6.356756756756757, "percentage": 63.57, "elapsed_time": "2:52:40", "remaining_time": "1:38:57"} +{"current_steps": 1177, "total_steps": 1850, "loss": 0.008, "lr": 1.4663798407073799e-06, "epoch": 6.3621621621621625, "percentage": 63.62, "elapsed_time": "2:52:42", "remaining_time": "1:38:45"} +{"current_steps": 1178, "total_steps": 1850, "loss": 0.0105, "lr": 1.4625157827171056e-06, "epoch": 6.367567567567567, "percentage": 63.68, "elapsed_time": "2:52:43", "remaining_time": "1:38:32"} +{"current_steps": 1179, "total_steps": 1850, "loss": 0.0161, "lr": 1.4586547165621385e-06, "epoch": 6.372972972972973, "percentage": 63.73, "elapsed_time": "2:52:45", "remaining_time": "1:38:19"} +{"current_steps": 1180, "total_steps": 1850, "loss": 0.0319, "lr": 1.4547966533767904e-06, "epoch": 6.378378378378378, "percentage": 63.78, "elapsed_time": "2:52:48", "remaining_time": "1:38:07"} +{"current_steps": 1181, "total_steps": 1850, "loss": 0.0389, "lr": 1.450941604286715e-06, "epoch": 6.383783783783784, "percentage": 63.84, "elapsed_time": "2:52:55", "remaining_time": "1:37:57"} +{"current_steps": 1182, "total_steps": 1850, "loss": 0.0585, "lr": 1.4470895804088736e-06, "epoch": 6.389189189189189, "percentage": 63.89, "elapsed_time": "2:52:58", "remaining_time": "1:37:45"} +{"current_steps": 1183, "total_steps": 1850, "loss": 0.011, "lr": 1.443240592851505e-06, "epoch": 6.394594594594595, "percentage": 63.95, "elapsed_time": "2:53:02", "remaining_time": "1:37:33"} +{"current_steps": 1184, "total_steps": 1850, "loss": 0.0237, "lr": 1.4393946527140884e-06, "epoch": 6.4, "percentage": 64.0, "elapsed_time": "2:53:03", "remaining_time": "1:37:20"} +{"current_steps": 1185, "total_steps": 1850, "loss": 0.047, "lr": 1.4355517710873184e-06, "epoch": 6.405405405405405, "percentage": 64.05, "elapsed_time": "2:53:06", "remaining_time": "1:37:08"} +{"current_steps": 1186, "total_steps": 1850, "loss": 0.0315, "lr": 1.4317119590530692e-06, "epoch": 6.410810810810811, "percentage": 64.11, "elapsed_time": "2:53:11", "remaining_time": "1:36:57"} +{"current_steps": 1187, "total_steps": 1850, "loss": 0.0301, "lr": 1.427875227684361e-06, "epoch": 6.416216216216216, "percentage": 64.16, "elapsed_time": "2:53:13", "remaining_time": "1:36:45"} +{"current_steps": 1188, "total_steps": 1850, "loss": 0.0357, "lr": 1.4240415880453327e-06, "epoch": 6.421621621621622, "percentage": 64.22, "elapsed_time": "2:53:16", "remaining_time": "1:36:33"} +{"current_steps": 1189, "total_steps": 1850, "loss": 0.0397, "lr": 1.420211051191206e-06, "epoch": 6.427027027027027, "percentage": 64.27, "elapsed_time": "2:53:18", "remaining_time": "1:36:20"} +{"current_steps": 1190, "total_steps": 1850, "loss": 0.0092, "lr": 1.4163836281682563e-06, "epoch": 6.4324324324324325, "percentage": 64.32, "elapsed_time": "2:53:20", "remaining_time": "1:36:08"} +{"current_steps": 1191, "total_steps": 1850, "loss": 0.106, "lr": 1.4125593300137767e-06, "epoch": 6.437837837837838, "percentage": 64.38, "elapsed_time": "2:53:23", "remaining_time": "1:35:56"} +{"current_steps": 1192, "total_steps": 1850, "loss": 0.0087, "lr": 1.4087381677560519e-06, "epoch": 6.443243243243243, "percentage": 64.43, "elapsed_time": "2:53:24", "remaining_time": "1:35:43"} +{"current_steps": 1193, "total_steps": 1850, "loss": 0.016, "lr": 1.4049201524143236e-06, "epoch": 6.448648648648649, "percentage": 64.49, "elapsed_time": "2:53:27", "remaining_time": "1:35:31"} +{"current_steps": 1194, "total_steps": 1850, "loss": 0.0065, "lr": 1.401105294998755e-06, "epoch": 6.454054054054054, "percentage": 64.54, "elapsed_time": "2:53:29", "remaining_time": "1:35:19"} +{"current_steps": 1195, "total_steps": 1850, "loss": 0.0248, "lr": 1.3972936065104064e-06, "epoch": 6.45945945945946, "percentage": 64.59, "elapsed_time": "2:53:34", "remaining_time": "1:35:08"} +{"current_steps": 1196, "total_steps": 1850, "loss": 0.0436, "lr": 1.393485097941199e-06, "epoch": 6.464864864864865, "percentage": 64.65, "elapsed_time": "2:53:36", "remaining_time": "1:34:56"} +{"current_steps": 1197, "total_steps": 1850, "loss": 0.0755, "lr": 1.3896797802738832e-06, "epoch": 6.47027027027027, "percentage": 64.7, "elapsed_time": "2:53:38", "remaining_time": "1:34:43"} +{"current_steps": 1198, "total_steps": 1850, "loss": 0.0194, "lr": 1.385877664482006e-06, "epoch": 6.475675675675676, "percentage": 64.76, "elapsed_time": "2:53:39", "remaining_time": "1:34:30"} +{"current_steps": 1199, "total_steps": 1850, "loss": 0.0416, "lr": 1.382078761529886e-06, "epoch": 6.481081081081081, "percentage": 64.81, "elapsed_time": "2:53:44", "remaining_time": "1:34:19"} +{"current_steps": 1200, "total_steps": 1850, "loss": 0.0678, "lr": 1.3782830823725713e-06, "epoch": 6.486486486486487, "percentage": 64.86, "elapsed_time": "2:53:47", "remaining_time": "1:34:08"} +{"current_steps": 1201, "total_steps": 1850, "loss": 0.0073, "lr": 1.3744906379558165e-06, "epoch": 6.491891891891892, "percentage": 64.92, "elapsed_time": "2:53:50", "remaining_time": "1:33:56"} +{"current_steps": 1202, "total_steps": 1850, "loss": 0.0436, "lr": 1.3707014392160477e-06, "epoch": 6.4972972972972975, "percentage": 64.97, "elapsed_time": "2:53:53", "remaining_time": "1:33:44"} +{"current_steps": 1203, "total_steps": 1850, "loss": 0.0087, "lr": 1.3669154970803313e-06, "epoch": 6.5027027027027025, "percentage": 65.03, "elapsed_time": "2:53:55", "remaining_time": "1:33:32"} +{"current_steps": 1204, "total_steps": 1850, "loss": 0.0483, "lr": 1.363132822466341e-06, "epoch": 6.508108108108108, "percentage": 65.08, "elapsed_time": "2:53:57", "remaining_time": "1:33:20"} +{"current_steps": 1205, "total_steps": 1850, "loss": 0.0147, "lr": 1.3593534262823289e-06, "epoch": 6.513513513513513, "percentage": 65.14, "elapsed_time": "2:53:59", "remaining_time": "1:33:08"} +{"current_steps": 1206, "total_steps": 1850, "loss": 0.0107, "lr": 1.355577319427095e-06, "epoch": 6.518918918918919, "percentage": 65.19, "elapsed_time": "2:54:01", "remaining_time": "1:32:55"} +{"current_steps": 1207, "total_steps": 1850, "loss": 0.0442, "lr": 1.3518045127899493e-06, "epoch": 6.524324324324324, "percentage": 65.24, "elapsed_time": "2:54:04", "remaining_time": "1:32:44"} +{"current_steps": 1208, "total_steps": 1850, "loss": 0.047, "lr": 1.3480350172506884e-06, "epoch": 6.52972972972973, "percentage": 65.3, "elapsed_time": "2:54:08", "remaining_time": "1:32:32"} +{"current_steps": 1209, "total_steps": 1850, "loss": 0.0341, "lr": 1.3442688436795592e-06, "epoch": 6.535135135135135, "percentage": 65.35, "elapsed_time": "2:54:10", "remaining_time": "1:32:20"} +{"current_steps": 1210, "total_steps": 1850, "loss": 0.0082, "lr": 1.3405060029372308e-06, "epoch": 6.54054054054054, "percentage": 65.41, "elapsed_time": "2:54:12", "remaining_time": "1:32:08"} +{"current_steps": 1211, "total_steps": 1850, "loss": 0.0223, "lr": 1.3367465058747566e-06, "epoch": 6.545945945945946, "percentage": 65.46, "elapsed_time": "2:54:13", "remaining_time": "1:31:55"} +{"current_steps": 1212, "total_steps": 1850, "loss": 0.011, "lr": 1.3329903633335528e-06, "epoch": 6.551351351351351, "percentage": 65.51, "elapsed_time": "2:54:15", "remaining_time": "1:31:43"} +{"current_steps": 1213, "total_steps": 1850, "loss": 0.0208, "lr": 1.3292375861453598e-06, "epoch": 6.556756756756757, "percentage": 65.57, "elapsed_time": "2:54:20", "remaining_time": "1:31:33"} +{"current_steps": 1214, "total_steps": 1850, "loss": 0.023, "lr": 1.3254881851322126e-06, "epoch": 6.562162162162162, "percentage": 65.62, "elapsed_time": "2:54:21", "remaining_time": "1:31:20"} +{"current_steps": 1215, "total_steps": 1850, "loss": 0.019, "lr": 1.3217421711064112e-06, "epoch": 6.5675675675675675, "percentage": 65.68, "elapsed_time": "2:54:24", "remaining_time": "1:31:09"} +{"current_steps": 1216, "total_steps": 1850, "loss": 0.0803, "lr": 1.3179995548704883e-06, "epoch": 6.572972972972973, "percentage": 65.73, "elapsed_time": "2:54:30", "remaining_time": "1:30:59"} +{"current_steps": 1217, "total_steps": 1850, "loss": 0.0941, "lr": 1.314260347217179e-06, "epoch": 6.578378378378378, "percentage": 65.78, "elapsed_time": "2:54:32", "remaining_time": "1:30:47"} +{"current_steps": 1218, "total_steps": 1850, "loss": 0.0516, "lr": 1.3105245589293852e-06, "epoch": 6.583783783783784, "percentage": 65.84, "elapsed_time": "2:54:35", "remaining_time": "1:30:35"} +{"current_steps": 1219, "total_steps": 1850, "loss": 0.0603, "lr": 1.3067922007801548e-06, "epoch": 6.589189189189189, "percentage": 65.89, "elapsed_time": "2:54:41", "remaining_time": "1:30:25"} +{"current_steps": 1220, "total_steps": 1850, "loss": 0.0797, "lr": 1.3030632835326378e-06, "epoch": 6.594594594594595, "percentage": 65.95, "elapsed_time": "2:54:45", "remaining_time": "1:30:14"} +{"current_steps": 1221, "total_steps": 1850, "loss": 0.0316, "lr": 1.2993378179400645e-06, "epoch": 6.6, "percentage": 66.0, "elapsed_time": "2:54:50", "remaining_time": "1:30:04"} +{"current_steps": 1222, "total_steps": 1850, "loss": 0.0553, "lr": 1.2956158147457116e-06, "epoch": 6.605405405405405, "percentage": 66.05, "elapsed_time": "2:54:54", "remaining_time": "1:29:53"} +{"current_steps": 1223, "total_steps": 1850, "loss": 0.0438, "lr": 1.2918972846828711e-06, "epoch": 6.610810810810811, "percentage": 66.11, "elapsed_time": "2:54:59", "remaining_time": "1:29:42"} +{"current_steps": 1224, "total_steps": 1850, "loss": 0.0781, "lr": 1.2881822384748176e-06, "epoch": 6.616216216216216, "percentage": 66.16, "elapsed_time": "2:55:01", "remaining_time": "1:29:30"} +{"current_steps": 1225, "total_steps": 1850, "loss": 0.3345, "lr": 1.2844706868347812e-06, "epoch": 6.621621621621622, "percentage": 66.22, "elapsed_time": "2:55:05", "remaining_time": "1:29:19"} +{"current_steps": 1226, "total_steps": 1850, "loss": 0.0915, "lr": 1.2807626404659144e-06, "epoch": 6.627027027027027, "percentage": 66.27, "elapsed_time": "2:55:08", "remaining_time": "1:29:08"} +{"current_steps": 1227, "total_steps": 1850, "loss": 0.0882, "lr": 1.2770581100612594e-06, "epoch": 6.632432432432433, "percentage": 66.32, "elapsed_time": "2:55:10", "remaining_time": "1:28:56"} +{"current_steps": 1228, "total_steps": 1850, "loss": 0.0297, "lr": 1.2733571063037214e-06, "epoch": 6.6378378378378375, "percentage": 66.38, "elapsed_time": "2:55:15", "remaining_time": "1:28:46"} +{"current_steps": 1229, "total_steps": 1850, "loss": 0.0506, "lr": 1.2696596398660358e-06, "epoch": 6.643243243243243, "percentage": 66.43, "elapsed_time": "2:55:16", "remaining_time": "1:28:33"} +{"current_steps": 1230, "total_steps": 1850, "loss": 0.0794, "lr": 1.2659657214107365e-06, "epoch": 6.648648648648649, "percentage": 66.49, "elapsed_time": "2:55:18", "remaining_time": "1:28:21"} +{"current_steps": 1231, "total_steps": 1850, "loss": 0.0145, "lr": 1.2622753615901245e-06, "epoch": 6.654054054054054, "percentage": 66.54, "elapsed_time": "2:55:22", "remaining_time": "1:28:11"} +{"current_steps": 1232, "total_steps": 1850, "loss": 0.0064, "lr": 1.2585885710462409e-06, "epoch": 6.65945945945946, "percentage": 66.59, "elapsed_time": "2:55:23", "remaining_time": "1:27:58"} +{"current_steps": 1233, "total_steps": 1850, "loss": 0.0765, "lr": 1.254905360410834e-06, "epoch": 6.664864864864865, "percentage": 66.65, "elapsed_time": "2:55:26", "remaining_time": "1:27:47"} +{"current_steps": 1234, "total_steps": 1850, "loss": 0.0092, "lr": 1.2512257403053257e-06, "epoch": 6.6702702702702705, "percentage": 66.7, "elapsed_time": "2:55:29", "remaining_time": "1:27:36"} +{"current_steps": 1235, "total_steps": 1850, "loss": 0.0183, "lr": 1.247549721340787e-06, "epoch": 6.675675675675675, "percentage": 66.76, "elapsed_time": "2:55:31", "remaining_time": "1:27:24"} +{"current_steps": 1236, "total_steps": 1850, "loss": 0.018, "lr": 1.2438773141179025e-06, "epoch": 6.681081081081081, "percentage": 66.81, "elapsed_time": "2:55:33", "remaining_time": "1:27:12"} +{"current_steps": 1237, "total_steps": 1850, "loss": 0.0123, "lr": 1.2402085292269427e-06, "epoch": 6.686486486486486, "percentage": 66.86, "elapsed_time": "2:55:37", "remaining_time": "1:27:01"} +{"current_steps": 1238, "total_steps": 1850, "loss": 0.0084, "lr": 1.236543377247729e-06, "epoch": 6.691891891891892, "percentage": 66.92, "elapsed_time": "2:55:41", "remaining_time": "1:26:51"} +{"current_steps": 1239, "total_steps": 1850, "loss": 0.009, "lr": 1.232881868749611e-06, "epoch": 6.697297297297297, "percentage": 66.97, "elapsed_time": "2:55:44", "remaining_time": "1:26:39"} +{"current_steps": 1240, "total_steps": 1850, "loss": 0.0087, "lr": 1.22922401429143e-06, "epoch": 6.702702702702703, "percentage": 67.03, "elapsed_time": "2:55:46", "remaining_time": "1:26:28"} +{"current_steps": 1241, "total_steps": 1850, "loss": 0.0331, "lr": 1.2255698244214863e-06, "epoch": 6.708108108108108, "percentage": 67.08, "elapsed_time": "2:55:48", "remaining_time": "1:26:16"} +{"current_steps": 1242, "total_steps": 1850, "loss": 0.0326, "lr": 1.2219193096775173e-06, "epoch": 6.713513513513513, "percentage": 67.14, "elapsed_time": "2:55:55", "remaining_time": "1:26:07"} +{"current_steps": 1243, "total_steps": 1850, "loss": 0.0088, "lr": 1.218272480586661e-06, "epoch": 6.718918918918919, "percentage": 67.19, "elapsed_time": "2:55:59", "remaining_time": "1:25:56"} +{"current_steps": 1244, "total_steps": 1850, "loss": 0.0529, "lr": 1.2146293476654242e-06, "epoch": 6.724324324324324, "percentage": 67.24, "elapsed_time": "2:56:06", "remaining_time": "1:25:47"} +{"current_steps": 1245, "total_steps": 1850, "loss": 0.0589, "lr": 1.2109899214196583e-06, "epoch": 6.72972972972973, "percentage": 67.3, "elapsed_time": "2:56:08", "remaining_time": "1:25:35"} +{"current_steps": 1246, "total_steps": 1850, "loss": 0.0138, "lr": 1.2073542123445239e-06, "epoch": 6.735135135135135, "percentage": 67.35, "elapsed_time": "2:56:10", "remaining_time": "1:25:24"} +{"current_steps": 1247, "total_steps": 1850, "loss": 0.0266, "lr": 1.2037222309244642e-06, "epoch": 6.7405405405405405, "percentage": 67.41, "elapsed_time": "2:56:12", "remaining_time": "1:25:12"} +{"current_steps": 1248, "total_steps": 1850, "loss": 0.0334, "lr": 1.200093987633169e-06, "epoch": 6.745945945945946, "percentage": 67.46, "elapsed_time": "2:56:14", "remaining_time": "1:25:00"} +{"current_steps": 1249, "total_steps": 1850, "loss": 0.0503, "lr": 1.1964694929335518e-06, "epoch": 6.751351351351351, "percentage": 67.51, "elapsed_time": "2:56:17", "remaining_time": "1:24:49"} +{"current_steps": 1250, "total_steps": 1850, "loss": 0.0672, "lr": 1.1928487572777158e-06, "epoch": 6.756756756756757, "percentage": 67.57, "elapsed_time": "2:56:20", "remaining_time": "1:24:38"} +{"current_steps": 1251, "total_steps": 1850, "loss": 0.1029, "lr": 1.1892317911069212e-06, "epoch": 6.762162162162162, "percentage": 67.62, "elapsed_time": "2:56:23", "remaining_time": "1:24:27"} +{"current_steps": 1252, "total_steps": 1850, "loss": 0.0151, "lr": 1.185618604851561e-06, "epoch": 6.767567567567568, "percentage": 67.68, "elapsed_time": "2:56:25", "remaining_time": "1:24:15"} +{"current_steps": 1253, "total_steps": 1850, "loss": 0.0179, "lr": 1.182009208931128e-06, "epoch": 6.772972972972973, "percentage": 67.73, "elapsed_time": "2:56:27", "remaining_time": "1:24:04"} +{"current_steps": 1254, "total_steps": 1850, "loss": 0.0079, "lr": 1.178403613754182e-06, "epoch": 6.778378378378378, "percentage": 67.78, "elapsed_time": "2:56:27", "remaining_time": "1:23:52"} +{"current_steps": 1255, "total_steps": 1850, "loss": 0.0194, "lr": 1.1748018297183239e-06, "epoch": 6.783783783783784, "percentage": 67.84, "elapsed_time": "2:56:31", "remaining_time": "1:23:41"} +{"current_steps": 1256, "total_steps": 1850, "loss": 0.024, "lr": 1.1712038672101654e-06, "epoch": 6.789189189189189, "percentage": 67.89, "elapsed_time": "2:56:37", "remaining_time": "1:23:31"} +{"current_steps": 1257, "total_steps": 1850, "loss": 0.1671, "lr": 1.1676097366052974e-06, "epoch": 6.794594594594595, "percentage": 67.95, "elapsed_time": "2:56:39", "remaining_time": "1:23:20"} +{"current_steps": 1258, "total_steps": 1850, "loss": 0.0644, "lr": 1.1640194482682573e-06, "epoch": 6.8, "percentage": 68.0, "elapsed_time": "2:56:42", "remaining_time": "1:23:09"} +{"current_steps": 1259, "total_steps": 1850, "loss": 0.0589, "lr": 1.160433012552508e-06, "epoch": 6.805405405405406, "percentage": 68.05, "elapsed_time": "2:56:45", "remaining_time": "1:22:58"} +{"current_steps": 1260, "total_steps": 1850, "loss": 0.161, "lr": 1.1568504398003995e-06, "epoch": 6.8108108108108105, "percentage": 68.11, "elapsed_time": "2:56:50", "remaining_time": "1:22:48"} +{"current_steps": 1261, "total_steps": 1850, "loss": 0.0429, "lr": 1.1532717403431405e-06, "epoch": 6.816216216216216, "percentage": 68.16, "elapsed_time": "2:56:55", "remaining_time": "1:22:38"} +{"current_steps": 1262, "total_steps": 1850, "loss": 0.0568, "lr": 1.1496969245007723e-06, "epoch": 6.821621621621622, "percentage": 68.22, "elapsed_time": "2:57:02", "remaining_time": "1:22:29"} +{"current_steps": 1263, "total_steps": 1850, "loss": 0.1732, "lr": 1.1461260025821374e-06, "epoch": 6.827027027027027, "percentage": 68.27, "elapsed_time": "2:57:05", "remaining_time": "1:22:18"} +{"current_steps": 1264, "total_steps": 1850, "loss": 0.0105, "lr": 1.1425589848848464e-06, "epoch": 6.832432432432433, "percentage": 68.32, "elapsed_time": "2:57:07", "remaining_time": "1:22:06"} +{"current_steps": 1265, "total_steps": 1850, "loss": 0.0379, "lr": 1.1389958816952538e-06, "epoch": 6.837837837837838, "percentage": 68.38, "elapsed_time": "2:57:09", "remaining_time": "1:21:55"} +{"current_steps": 1266, "total_steps": 1850, "loss": 0.0409, "lr": 1.1354367032884245e-06, "epoch": 6.8432432432432435, "percentage": 68.43, "elapsed_time": "2:57:12", "remaining_time": "1:21:44"} +{"current_steps": 1267, "total_steps": 1850, "loss": 0.0282, "lr": 1.131881459928107e-06, "epoch": 6.848648648648648, "percentage": 68.49, "elapsed_time": "2:57:15", "remaining_time": "1:21:34"} +{"current_steps": 1268, "total_steps": 1850, "loss": 0.0177, "lr": 1.128330161866698e-06, "epoch": 6.854054054054054, "percentage": 68.54, "elapsed_time": "2:57:18", "remaining_time": "1:21:23"} +{"current_steps": 1269, "total_steps": 1850, "loss": 0.0112, "lr": 1.1247828193452215e-06, "epoch": 6.859459459459459, "percentage": 68.59, "elapsed_time": "2:57:22", "remaining_time": "1:21:12"} +{"current_steps": 1270, "total_steps": 1850, "loss": 0.0496, "lr": 1.1212394425932937e-06, "epoch": 6.864864864864865, "percentage": 68.65, "elapsed_time": "2:57:24", "remaining_time": "1:21:01"} +{"current_steps": 1271, "total_steps": 1850, "loss": 0.0737, "lr": 1.1177000418290917e-06, "epoch": 6.87027027027027, "percentage": 68.7, "elapsed_time": "2:57:29", "remaining_time": "1:20:51"} +{"current_steps": 1272, "total_steps": 1850, "loss": 0.0204, "lr": 1.1141646272593303e-06, "epoch": 6.875675675675676, "percentage": 68.76, "elapsed_time": "2:57:32", "remaining_time": "1:20:40"} +{"current_steps": 1273, "total_steps": 1850, "loss": 0.0126, "lr": 1.1106332090792273e-06, "epoch": 6.881081081081081, "percentage": 68.81, "elapsed_time": "2:57:33", "remaining_time": "1:20:28"} +{"current_steps": 1274, "total_steps": 1850, "loss": 0.0451, "lr": 1.1071057974724783e-06, "epoch": 6.886486486486486, "percentage": 68.86, "elapsed_time": "2:57:37", "remaining_time": "1:20:18"} +{"current_steps": 1275, "total_steps": 1850, "loss": 0.0876, "lr": 1.1035824026112205e-06, "epoch": 6.891891891891892, "percentage": 68.92, "elapsed_time": "2:57:43", "remaining_time": "1:20:09"} +{"current_steps": 1276, "total_steps": 1850, "loss": 0.017, "lr": 1.1000630346560118e-06, "epoch": 6.897297297297297, "percentage": 68.97, "elapsed_time": "2:57:46", "remaining_time": "1:19:58"} +{"current_steps": 1277, "total_steps": 1850, "loss": 0.0199, "lr": 1.0965477037557973e-06, "epoch": 6.902702702702703, "percentage": 69.03, "elapsed_time": "2:57:52", "remaining_time": "1:19:48"} +{"current_steps": 1278, "total_steps": 1850, "loss": 0.0154, "lr": 1.093036420047876e-06, "epoch": 6.908108108108108, "percentage": 69.08, "elapsed_time": "2:57:54", "remaining_time": "1:19:37"} +{"current_steps": 1279, "total_steps": 1850, "loss": 0.0101, "lr": 1.0895291936578825e-06, "epoch": 6.9135135135135135, "percentage": 69.14, "elapsed_time": "2:57:58", "remaining_time": "1:19:27"} +{"current_steps": 1280, "total_steps": 1850, "loss": 0.0145, "lr": 1.0860260346997475e-06, "epoch": 6.918918918918919, "percentage": 69.19, "elapsed_time": "2:58:03", "remaining_time": "1:19:17"} +{"current_steps": 1281, "total_steps": 1850, "loss": 0.0616, "lr": 1.0825269532756707e-06, "epoch": 6.924324324324324, "percentage": 69.24, "elapsed_time": "2:58:05", "remaining_time": "1:19:06"} +{"current_steps": 1282, "total_steps": 1850, "loss": 0.0563, "lr": 1.079031959476096e-06, "epoch": 6.92972972972973, "percentage": 69.3, "elapsed_time": "2:58:12", "remaining_time": "1:18:57"} +{"current_steps": 1283, "total_steps": 1850, "loss": 0.0091, "lr": 1.0755410633796799e-06, "epoch": 6.935135135135135, "percentage": 69.35, "elapsed_time": "2:58:12", "remaining_time": "1:18:45"} +{"current_steps": 1284, "total_steps": 1850, "loss": 0.0241, "lr": 1.0720542750532584e-06, "epoch": 6.940540540540541, "percentage": 69.41, "elapsed_time": "2:58:13", "remaining_time": "1:18:34"} +{"current_steps": 1285, "total_steps": 1850, "loss": 0.0289, "lr": 1.0685716045518262e-06, "epoch": 6.945945945945946, "percentage": 69.46, "elapsed_time": "2:58:15", "remaining_time": "1:18:22"} +{"current_steps": 1286, "total_steps": 1850, "loss": 0.0827, "lr": 1.065093061918501e-06, "epoch": 6.951351351351351, "percentage": 69.51, "elapsed_time": "2:58:20", "remaining_time": "1:18:12"} +{"current_steps": 1287, "total_steps": 1850, "loss": 0.0307, "lr": 1.0616186571844983e-06, "epoch": 6.956756756756757, "percentage": 69.57, "elapsed_time": "2:58:21", "remaining_time": "1:18:01"} +{"current_steps": 1288, "total_steps": 1850, "loss": 0.1038, "lr": 1.058148400369098e-06, "epoch": 6.962162162162162, "percentage": 69.62, "elapsed_time": "2:58:24", "remaining_time": "1:17:50"} +{"current_steps": 1289, "total_steps": 1850, "loss": 0.0097, "lr": 1.0546823014796215e-06, "epoch": 6.967567567567568, "percentage": 69.68, "elapsed_time": "2:58:27", "remaining_time": "1:17:40"} +{"current_steps": 1290, "total_steps": 1850, "loss": 0.01, "lr": 1.051220370511399e-06, "epoch": 6.972972972972973, "percentage": 69.73, "elapsed_time": "2:58:30", "remaining_time": "1:17:29"}