diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4e5f96ed7d7d43a4b79695449ceb58ca95fcdfe5 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +wandb/ diff --git a/checkpoint-16500/config.json b/checkpoint-16500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-16500/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-16500/experiment_cfg/metadata.json b/checkpoint-16500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-16500/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-16500/model-00001-of-00002.safetensors b/checkpoint-16500/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4671d68342d99e0ead2ed01df1a21554d4b3862 --- /dev/null +++ b/checkpoint-16500/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f44e600a0377090145d494deff29629d60bdb82a14122ef9f0ae63b069cef1 +size 4938446392 diff --git a/checkpoint-16500/model-00002-of-00002.safetensors b/checkpoint-16500/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a6d6ef6b505cca3f8cc92d9a53a2c92a32f86e1a --- /dev/null +++ b/checkpoint-16500/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b75e0739e9492b48f86b0599a00801fbb8a0457b33b0d669400cce540358a34e +size 3821736024 diff --git a/checkpoint-16500/model.safetensors.index.json b/checkpoint-16500/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-16500/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-16500/optimizer.pt b/checkpoint-16500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b40e9c28cc765b365bbdd0f7351fd47953aaae1a --- /dev/null +++ b/checkpoint-16500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e2a75972d94063164bed1cf6988277c7f82c58e5f9ae50ff160cbf79390b4b7 +size 10272357262 diff --git a/checkpoint-16500/rng_state.pth b/checkpoint-16500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..313b202cf4444dabe2a5d24ab5b2951de4c1b91e --- /dev/null +++ b/checkpoint-16500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7121ae7ee8a0fde0c840794911bd767a39997d7c9bb32ff66140651ab9f7d5cb +size 14244 diff --git a/checkpoint-16500/scheduler.pt b/checkpoint-16500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..66d2bc204c2e6cec97c0013ebfd334017a35574d --- /dev/null +++ b/checkpoint-16500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e516b2cee23cefc4b68cd5045f7ccb7dd886a07aa3b0be782363db9cf385bf96 +size 1064 diff --git a/checkpoint-16500/trainer_state.json b/checkpoint-16500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d3362e9563db4a56f0c3ac9763cb7d511e299a1 --- /dev/null +++ b/checkpoint-16500/trainer_state.json @@ -0,0 +1,11583 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.23335198656967, + "eval_steps": 500, + "global_step": 16500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.850378233207195e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-17000/config.json b/checkpoint-17000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-17000/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-17000/experiment_cfg/metadata.json b/checkpoint-17000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-17000/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-17000/model-00001-of-00002.safetensors b/checkpoint-17000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d177b6cd8a2c9dcb9c4d367b09965063889a342 --- /dev/null +++ b/checkpoint-17000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83006184c33fed3978ad8496a96f664913504244e32120edbfefca714649219a +size 4938446392 diff --git a/checkpoint-17000/model-00002-of-00002.safetensors b/checkpoint-17000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6bcd99ad1c4a34ed374d04dc4bc2e9979af9a72 --- /dev/null +++ b/checkpoint-17000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2e4a4db8e6177dcabb195be4d5dca198452b4e485a19b810e5322961044ab3 +size 3821736024 diff --git a/checkpoint-17000/model.safetensors.index.json b/checkpoint-17000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-17000/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-17000/optimizer.pt b/checkpoint-17000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b91bacbb4eed2cad5423fee743f2f5129808e031 --- /dev/null +++ b/checkpoint-17000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:710bd9e6e4881ec6e311bed6af7afe8a7084211f5c393824c2c48488392b4ddd +size 10272357262 diff --git a/checkpoint-17000/rng_state.pth b/checkpoint-17000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f41d21f6711199b1c1c2805678ff790e0c205b8 --- /dev/null +++ b/checkpoint-17000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8953e4862c100ef835187d8cc199359c0cfa51c1cb903ab46ecc8f66668ea62a +size 14244 diff --git a/checkpoint-17000/scheduler.pt b/checkpoint-17000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3311f683c24a467f8803297160d37eb9ee0fe0ed --- /dev/null +++ b/checkpoint-17000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2fa06e549abfdcd5897974c6ccae77ef9f8517652438a6c6fb1500d8d81beb +size 1064 diff --git a/checkpoint-17000/trainer_state.json b/checkpoint-17000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc2fe31ca6f67fb140a39db4e9d20b02d21409 --- /dev/null +++ b/checkpoint-17000/trainer_state.json @@ -0,0 +1,11933 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.513150531617235, + "eval_steps": 500, + "global_step": 17000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.027728928603035e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-17500/config.json b/checkpoint-17500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-17500/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-17500/experiment_cfg/metadata.json b/checkpoint-17500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-17500/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-17500/model-00001-of-00002.safetensors b/checkpoint-17500/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95ffe58f8b085341cc6221afcf13e6e77acf3d99 --- /dev/null +++ b/checkpoint-17500/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86de43f18ba01b62339edd0bbae260c9f05f50e7d10bf822fc8c3549ed35d87e +size 4938446392 diff --git a/checkpoint-17500/model-00002-of-00002.safetensors b/checkpoint-17500/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64920bc5492d3ad500d53124772f0d9ae7dd83b6 --- /dev/null +++ b/checkpoint-17500/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a568a1207da9605d5c25d07f46a00c7f52fc70350c5369bd3a1f5a1f49173a0 +size 3821736024 diff --git a/checkpoint-17500/model.safetensors.index.json b/checkpoint-17500/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-17500/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-17500/optimizer.pt b/checkpoint-17500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2e6cd966c34ea650ac331d2fd7d8ad63424ac14 --- /dev/null +++ b/checkpoint-17500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d79d59a9daed496604a64d4bf56c925b43b9323329896650f15f0c1685e9596 +size 10272357262 diff --git a/checkpoint-17500/rng_state.pth b/checkpoint-17500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a817e9473ee7db008b4b0e2198aa16c656757fde --- /dev/null +++ b/checkpoint-17500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1a2719277bd1edde3b6dee4eb3a8a20d873210df6273ee5e5923159e998d2b +size 14244 diff --git a/checkpoint-17500/scheduler.pt b/checkpoint-17500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cec86ab3797409ebc27917d7385120c12cb101e --- /dev/null +++ b/checkpoint-17500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:339d2ec8c8224241e4fdac4aed961ed0adcc89e454fd999e8b353c16b5d45e6b +size 1064 diff --git a/checkpoint-17500/trainer_state.json b/checkpoint-17500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd5b6c09c85a698e942917a4c107528c7eeb5cb --- /dev/null +++ b/checkpoint-17500/trainer_state.json @@ -0,0 +1,12283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.7929490766648, + "eval_steps": 500, + "global_step": 17500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.205079623998875e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18000/config.json b/checkpoint-18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-18000/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-18000/experiment_cfg/metadata.json b/checkpoint-18000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-18000/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-18000/model-00001-of-00002.safetensors b/checkpoint-18000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5115903516c24c5129bff11c1455810e6e2684a7 --- /dev/null +++ b/checkpoint-18000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922040006a3d331da7f5a61b8f992a4f9d6c10ea872efc10c3bb3a6ea235a5b0 +size 4938446392 diff --git a/checkpoint-18000/model-00002-of-00002.safetensors b/checkpoint-18000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a92d2b5d1d0365ef8f3c4c620b02b3b6e3ba1f7c --- /dev/null +++ b/checkpoint-18000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48843fdc8a8ab892f3ce231924e7139d5fbd5a0e40363a6f49c3e38074920d23 +size 3821736024 diff --git a/checkpoint-18000/model.safetensors.index.json b/checkpoint-18000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-18000/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-18000/optimizer.pt b/checkpoint-18000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..db6c805434781bcde4cbb2d29c5db6d8c7e74853 --- /dev/null +++ b/checkpoint-18000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcf64be3a6610009784b29c3ee9654815e96e7dea46c3693735f67d88755790 +size 10272357262 diff --git a/checkpoint-18000/rng_state.pth b/checkpoint-18000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4c56bbec1fdf76a4c63fc335162d13f85427ea8 --- /dev/null +++ b/checkpoint-18000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7996a9a9f6343c38a740b3a82b89286b542091737169ad65da0d1bfe449e3f +size 14244 diff --git a/checkpoint-18000/scheduler.pt b/checkpoint-18000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a78fe4001dc8fe34dced7c163122288d821400be --- /dev/null +++ b/checkpoint-18000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde5ad20c99e88c9018bd57b1aa9c6baea596e7481451a33714b81068c5f8c3c +size 1064 diff --git a/checkpoint-18000/trainer_state.json b/checkpoint-18000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ed1113e3a3cffb7320fcbf308705ebb9b2be9ee --- /dev/null +++ b/checkpoint-18000/trainer_state.json @@ -0,0 +1,12633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.072747621712367, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.382186462188545e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18500/config.json b/checkpoint-18500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-18500/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-18500/experiment_cfg/metadata.json b/checkpoint-18500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-18500/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-18500/model-00001-of-00002.safetensors b/checkpoint-18500/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2affd87229b991c14a3af6afe696e94ea8b2187 --- /dev/null +++ b/checkpoint-18500/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59885fc01f41b6360c6d93857e5a75d8ae6727f8370d1e4c2828ec8354624d38 +size 4938446392 diff --git a/checkpoint-18500/model-00002-of-00002.safetensors b/checkpoint-18500/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c72d42fe6b1c640a281931b329fabc7d72b1901 --- /dev/null +++ b/checkpoint-18500/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd899c13a983d834f0f8b94097b3dc031310796ed13a4cac942be53bacfacaf5 +size 3821736024 diff --git a/checkpoint-18500/model.safetensors.index.json b/checkpoint-18500/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-18500/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-18500/optimizer.pt b/checkpoint-18500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..07fa14444139d77063a2f1b7311afdfd44801f32 --- /dev/null +++ b/checkpoint-18500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb30d82a58ffe6daa12ff003e1eb8ed989b0eeb6ce15e59ba5bf8bddf485b78 +size 10272357262 diff --git a/checkpoint-18500/rng_state.pth b/checkpoint-18500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec2f7af8ca3863f4eed02cd18c02533ace288656 --- /dev/null +++ b/checkpoint-18500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593a05b7a935843c766aa788d9261f0f17fe8ab557241cbba3bd99f72a463963 +size 14244 diff --git a/checkpoint-18500/scheduler.pt b/checkpoint-18500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..be34647881ea91657113757bb2984323265aea59 --- /dev/null +++ b/checkpoint-18500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9d3be904528e69716f7e36f8255818a7d917ce1ea3719c2a28917b7c1db694b +size 1064 diff --git a/checkpoint-18500/trainer_state.json b/checkpoint-18500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..597bec226ad5a481d40ad4f7a9bd6aac44dd852f --- /dev/null +++ b/checkpoint-18500/trainer_state.json @@ -0,0 +1,12983 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.352546166759932, + "eval_steps": 500, + "global_step": 18500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + }, + { + "epoch": 10.078343592613319, + "grad_norm": 0.07472005486488342, + "learning_rate": 2.682358534298285e-06, + "loss": 0.006, + "step": 18010 + }, + { + "epoch": 10.083939563514269, + "grad_norm": 0.09040942043066025, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.004, + "step": 18020 + }, + { + "epoch": 10.089535534415221, + "grad_norm": 0.037220001220703125, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0038, + "step": 18030 + }, + { + "epoch": 10.095131505316173, + "grad_norm": 0.11240635067224503, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0031, + "step": 18040 + }, + { + "epoch": 10.100727476217124, + "grad_norm": 0.12259605526924133, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0049, + "step": 18050 + }, + { + "epoch": 10.106323447118076, + "grad_norm": 0.16758129000663757, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0066, + "step": 18060 + }, + { + "epoch": 10.111919418019026, + "grad_norm": 0.10704974085092545, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.0021, + "step": 18070 + }, + { + "epoch": 10.117515388919978, + "grad_norm": 0.19684171676635742, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0035, + "step": 18080 + }, + { + "epoch": 10.123111359820928, + "grad_norm": 0.25069093704223633, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0038, + "step": 18090 + }, + { + "epoch": 10.12870733072188, + "grad_norm": 0.15888355672359467, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0055, + "step": 18100 + }, + { + "epoch": 10.13430330162283, + "grad_norm": 0.1355513483285904, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0032, + "step": 18110 + }, + { + "epoch": 10.139899272523783, + "grad_norm": 0.09521888941526413, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0072, + "step": 18120 + }, + { + "epoch": 10.145495243424735, + "grad_norm": 0.18774038553237915, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0028, + "step": 18130 + }, + { + "epoch": 10.151091214325685, + "grad_norm": 0.06293562054634094, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0032, + "step": 18140 + }, + { + "epoch": 10.156687185226637, + "grad_norm": 0.25169095396995544, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0092, + "step": 18150 + }, + { + "epoch": 10.162283156127588, + "grad_norm": 0.08376752585172653, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0021, + "step": 18160 + }, + { + "epoch": 10.16787912702854, + "grad_norm": 0.06758670508861542, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 10.17347509792949, + "grad_norm": 0.06193256378173828, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0036, + "step": 18180 + }, + { + "epoch": 10.179071068830442, + "grad_norm": 0.21087805926799774, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0043, + "step": 18190 + }, + { + "epoch": 10.184667039731393, + "grad_norm": 0.09581877291202545, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0048, + "step": 18200 + }, + { + "epoch": 10.190263010632345, + "grad_norm": 0.23187117278575897, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0068, + "step": 18210 + }, + { + "epoch": 10.195858981533297, + "grad_norm": 0.1904383897781372, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0036, + "step": 18220 + }, + { + "epoch": 10.201454952434247, + "grad_norm": 0.04588289558887482, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0075, + "step": 18230 + }, + { + "epoch": 10.2070509233352, + "grad_norm": 0.05446457862854004, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 10.21264689423615, + "grad_norm": 0.12907229363918304, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 10.218242865137102, + "grad_norm": 0.0724627822637558, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0024, + "step": 18260 + }, + { + "epoch": 10.223838836038052, + "grad_norm": 0.11797565221786499, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0032, + "step": 18270 + }, + { + "epoch": 10.229434806939004, + "grad_norm": 0.1239556148648262, + "learning_rate": 2.008446509461498e-06, + "loss": 0.0038, + "step": 18280 + }, + { + "epoch": 10.235030777839956, + "grad_norm": 0.05614084377884865, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0041, + "step": 18290 + }, + { + "epoch": 10.240626748740906, + "grad_norm": 0.2968387007713318, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0092, + "step": 18300 + }, + { + "epoch": 10.246222719641858, + "grad_norm": 0.11529407650232315, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0059, + "step": 18310 + }, + { + "epoch": 10.251818690542809, + "grad_norm": 0.24037353694438934, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0045, + "step": 18320 + }, + { + "epoch": 10.257414661443761, + "grad_norm": 0.20346900820732117, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 10.263010632344711, + "grad_norm": 0.27883380651474, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0049, + "step": 18340 + }, + { + "epoch": 10.268606603245663, + "grad_norm": 0.12232355028390884, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0077, + "step": 18350 + }, + { + "epoch": 10.274202574146614, + "grad_norm": 0.09397400170564651, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0039, + "step": 18360 + }, + { + "epoch": 10.279798545047566, + "grad_norm": 0.13843244314193726, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.0059, + "step": 18370 + }, + { + "epoch": 10.285394515948518, + "grad_norm": 0.04927824065089226, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.003, + "step": 18380 + }, + { + "epoch": 10.290990486849468, + "grad_norm": 0.2848436236381531, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.004, + "step": 18390 + }, + { + "epoch": 10.29658645775042, + "grad_norm": 0.10808296501636505, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0072, + "step": 18400 + }, + { + "epoch": 10.30218242865137, + "grad_norm": 0.08363109827041626, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0029, + "step": 18410 + }, + { + "epoch": 10.307778399552323, + "grad_norm": 0.07970738410949707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0058, + "step": 18420 + }, + { + "epoch": 10.313374370453273, + "grad_norm": 0.06155739724636078, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0042, + "step": 18430 + }, + { + "epoch": 10.318970341354225, + "grad_norm": 0.15518154203891754, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.006, + "step": 18440 + }, + { + "epoch": 10.324566312255175, + "grad_norm": 0.06478218734264374, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0083, + "step": 18450 + }, + { + "epoch": 10.330162283156128, + "grad_norm": 0.11871203780174255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 10.33575825405708, + "grad_norm": 0.13164940476417542, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 10.34135422495803, + "grad_norm": 0.08165992051362991, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0065, + "step": 18480 + }, + { + "epoch": 10.346950195858982, + "grad_norm": 0.06479761004447937, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0044, + "step": 18490 + }, + { + "epoch": 10.352546166759932, + "grad_norm": 0.24080127477645874, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0085, + "step": 18500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.559537157584385e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-19000/config.json b/checkpoint-19000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-19000/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-19000/experiment_cfg/metadata.json b/checkpoint-19000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-19000/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-19000/model-00001-of-00002.safetensors b/checkpoint-19000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9d22f7d4a0c1ee6649c3ec7fdeec1003f45a203 --- /dev/null +++ b/checkpoint-19000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1529b6c13af6a535511abecc7b74a9f117cd59288959a1df8f1a1e8c700018ae +size 4938446392 diff --git a/checkpoint-19000/model-00002-of-00002.safetensors b/checkpoint-19000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d072086dd83f66c42535ed2d3cb23f5c6313f72 --- /dev/null +++ b/checkpoint-19000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2e93f40933ced98c9b1b8354bed3743dee03a68948598a96f2cdaa243b36d3 +size 3821736024 diff --git a/checkpoint-19000/model.safetensors.index.json b/checkpoint-19000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-19000/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-19000/optimizer.pt b/checkpoint-19000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..42219f8e368e31d872e8f22f294f89e22b45c034 --- /dev/null +++ b/checkpoint-19000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50223aebfa3da2266651551e85954ac96b847dbd3b7062e930f6390126cac61e +size 10272357262 diff --git a/checkpoint-19000/rng_state.pth b/checkpoint-19000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..95750f215c3a61235b6ef98f4f8f64062fb27724 --- /dev/null +++ b/checkpoint-19000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cb59e87a56791f3e0415532e208486f622cafccfa4194c5e7c770a00daf5bc +size 14244 diff --git a/checkpoint-19000/scheduler.pt b/checkpoint-19000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2400d7ba454dffb6376879049add1f279d2d6dea --- /dev/null +++ b/checkpoint-19000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6497fee0e75f0e896356192ff73d0b494e5ff2ffafa26e148f4194f6eee2f10 +size 1064 diff --git a/checkpoint-19000/trainer_state.json b/checkpoint-19000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5abc0759faef4a7e1730b323652f267f11b35509 --- /dev/null +++ b/checkpoint-19000/trainer_state.json @@ -0,0 +1,13333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.632344711807498, + "eval_steps": 500, + "global_step": 19000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + }, + { + "epoch": 10.078343592613319, + "grad_norm": 0.07472005486488342, + "learning_rate": 2.682358534298285e-06, + "loss": 0.006, + "step": 18010 + }, + { + "epoch": 10.083939563514269, + "grad_norm": 0.09040942043066025, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.004, + "step": 18020 + }, + { + "epoch": 10.089535534415221, + "grad_norm": 0.037220001220703125, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0038, + "step": 18030 + }, + { + "epoch": 10.095131505316173, + "grad_norm": 0.11240635067224503, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0031, + "step": 18040 + }, + { + "epoch": 10.100727476217124, + "grad_norm": 0.12259605526924133, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0049, + "step": 18050 + }, + { + "epoch": 10.106323447118076, + "grad_norm": 0.16758129000663757, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0066, + "step": 18060 + }, + { + "epoch": 10.111919418019026, + "grad_norm": 0.10704974085092545, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.0021, + "step": 18070 + }, + { + "epoch": 10.117515388919978, + "grad_norm": 0.19684171676635742, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0035, + "step": 18080 + }, + { + "epoch": 10.123111359820928, + "grad_norm": 0.25069093704223633, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0038, + "step": 18090 + }, + { + "epoch": 10.12870733072188, + "grad_norm": 0.15888355672359467, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0055, + "step": 18100 + }, + { + "epoch": 10.13430330162283, + "grad_norm": 0.1355513483285904, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0032, + "step": 18110 + }, + { + "epoch": 10.139899272523783, + "grad_norm": 0.09521888941526413, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0072, + "step": 18120 + }, + { + "epoch": 10.145495243424735, + "grad_norm": 0.18774038553237915, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0028, + "step": 18130 + }, + { + "epoch": 10.151091214325685, + "grad_norm": 0.06293562054634094, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0032, + "step": 18140 + }, + { + "epoch": 10.156687185226637, + "grad_norm": 0.25169095396995544, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0092, + "step": 18150 + }, + { + "epoch": 10.162283156127588, + "grad_norm": 0.08376752585172653, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0021, + "step": 18160 + }, + { + "epoch": 10.16787912702854, + "grad_norm": 0.06758670508861542, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 10.17347509792949, + "grad_norm": 0.06193256378173828, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0036, + "step": 18180 + }, + { + "epoch": 10.179071068830442, + "grad_norm": 0.21087805926799774, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0043, + "step": 18190 + }, + { + "epoch": 10.184667039731393, + "grad_norm": 0.09581877291202545, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0048, + "step": 18200 + }, + { + "epoch": 10.190263010632345, + "grad_norm": 0.23187117278575897, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0068, + "step": 18210 + }, + { + "epoch": 10.195858981533297, + "grad_norm": 0.1904383897781372, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0036, + "step": 18220 + }, + { + "epoch": 10.201454952434247, + "grad_norm": 0.04588289558887482, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0075, + "step": 18230 + }, + { + "epoch": 10.2070509233352, + "grad_norm": 0.05446457862854004, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 10.21264689423615, + "grad_norm": 0.12907229363918304, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 10.218242865137102, + "grad_norm": 0.0724627822637558, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0024, + "step": 18260 + }, + { + "epoch": 10.223838836038052, + "grad_norm": 0.11797565221786499, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0032, + "step": 18270 + }, + { + "epoch": 10.229434806939004, + "grad_norm": 0.1239556148648262, + "learning_rate": 2.008446509461498e-06, + "loss": 0.0038, + "step": 18280 + }, + { + "epoch": 10.235030777839956, + "grad_norm": 0.05614084377884865, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0041, + "step": 18290 + }, + { + "epoch": 10.240626748740906, + "grad_norm": 0.2968387007713318, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0092, + "step": 18300 + }, + { + "epoch": 10.246222719641858, + "grad_norm": 0.11529407650232315, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0059, + "step": 18310 + }, + { + "epoch": 10.251818690542809, + "grad_norm": 0.24037353694438934, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0045, + "step": 18320 + }, + { + "epoch": 10.257414661443761, + "grad_norm": 0.20346900820732117, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 10.263010632344711, + "grad_norm": 0.27883380651474, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0049, + "step": 18340 + }, + { + "epoch": 10.268606603245663, + "grad_norm": 0.12232355028390884, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0077, + "step": 18350 + }, + { + "epoch": 10.274202574146614, + "grad_norm": 0.09397400170564651, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0039, + "step": 18360 + }, + { + "epoch": 10.279798545047566, + "grad_norm": 0.13843244314193726, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.0059, + "step": 18370 + }, + { + "epoch": 10.285394515948518, + "grad_norm": 0.04927824065089226, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.003, + "step": 18380 + }, + { + "epoch": 10.290990486849468, + "grad_norm": 0.2848436236381531, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.004, + "step": 18390 + }, + { + "epoch": 10.29658645775042, + "grad_norm": 0.10808296501636505, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0072, + "step": 18400 + }, + { + "epoch": 10.30218242865137, + "grad_norm": 0.08363109827041626, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0029, + "step": 18410 + }, + { + "epoch": 10.307778399552323, + "grad_norm": 0.07970738410949707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0058, + "step": 18420 + }, + { + "epoch": 10.313374370453273, + "grad_norm": 0.06155739724636078, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0042, + "step": 18430 + }, + { + "epoch": 10.318970341354225, + "grad_norm": 0.15518154203891754, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.006, + "step": 18440 + }, + { + "epoch": 10.324566312255175, + "grad_norm": 0.06478218734264374, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0083, + "step": 18450 + }, + { + "epoch": 10.330162283156128, + "grad_norm": 0.11871203780174255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 10.33575825405708, + "grad_norm": 0.13164940476417542, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 10.34135422495803, + "grad_norm": 0.08165992051362991, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0065, + "step": 18480 + }, + { + "epoch": 10.346950195858982, + "grad_norm": 0.06479761004447937, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0044, + "step": 18490 + }, + { + "epoch": 10.352546166759932, + "grad_norm": 0.24080127477645874, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0085, + "step": 18500 + }, + { + "epoch": 10.358142137660884, + "grad_norm": 0.1411421000957489, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0045, + "step": 18510 + }, + { + "epoch": 10.363738108561835, + "grad_norm": 0.058580052107572556, + "learning_rate": 1.4896615227983468e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 10.369334079462787, + "grad_norm": 0.1638147383928299, + "learning_rate": 1.4696978433290653e-06, + "loss": 0.0054, + "step": 18530 + }, + { + "epoch": 10.374930050363739, + "grad_norm": 0.05566524341702461, + "learning_rate": 1.4498668438527597e-06, + "loss": 0.004, + "step": 18540 + }, + { + "epoch": 10.38052602126469, + "grad_norm": 0.07601140439510345, + "learning_rate": 1.4301685785866214e-06, + "loss": 0.0034, + "step": 18550 + }, + { + "epoch": 10.386121992165641, + "grad_norm": 0.10449633747339249, + "learning_rate": 1.4106031013849496e-06, + "loss": 0.0041, + "step": 18560 + }, + { + "epoch": 10.391717963066592, + "grad_norm": 0.15937356650829315, + "learning_rate": 1.3911704657390113e-06, + "loss": 0.0039, + "step": 18570 + }, + { + "epoch": 10.397313933967544, + "grad_norm": 0.059475306421518326, + "learning_rate": 1.3718707247769135e-06, + "loss": 0.006, + "step": 18580 + }, + { + "epoch": 10.402909904868494, + "grad_norm": 0.24354378879070282, + "learning_rate": 1.3527039312633827e-06, + "loss": 0.0042, + "step": 18590 + }, + { + "epoch": 10.408505875769446, + "grad_norm": 0.20878778398036957, + "learning_rate": 1.333670137599713e-06, + "loss": 0.0107, + "step": 18600 + }, + { + "epoch": 10.414101846670397, + "grad_norm": 0.1909496784210205, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0034, + "step": 18610 + }, + { + "epoch": 10.419697817571349, + "grad_norm": 0.13632823526859283, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0066, + "step": 18620 + }, + { + "epoch": 10.4252937884723, + "grad_norm": 0.10793755203485489, + "learning_rate": 1.2773672742655784e-06, + "loss": 0.0037, + "step": 18630 + }, + { + "epoch": 10.430889759373251, + "grad_norm": 0.10346037149429321, + "learning_rate": 1.2588659967397e-06, + "loss": 0.0044, + "step": 18640 + }, + { + "epoch": 10.436485730274203, + "grad_norm": 0.08834080398082733, + "learning_rate": 1.2404979756130142e-06, + "loss": 0.0037, + "step": 18650 + }, + { + "epoch": 10.442081701175153, + "grad_norm": 0.09045784175395966, + "learning_rate": 1.222263261102985e-06, + "loss": 0.0052, + "step": 18660 + }, + { + "epoch": 10.447677672076106, + "grad_norm": 0.07731129229068756, + "learning_rate": 1.2041619030626284e-06, + "loss": 0.0071, + "step": 18670 + }, + { + "epoch": 10.453273642977056, + "grad_norm": 0.08769071102142334, + "learning_rate": 1.1861939509803687e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 10.458869613878008, + "grad_norm": 0.15766629576683044, + "learning_rate": 1.1683594539798893e-06, + "loss": 0.0063, + "step": 18690 + }, + { + "epoch": 10.46446558477896, + "grad_norm": 0.11048921942710876, + "learning_rate": 1.1506584608200367e-06, + "loss": 0.0033, + "step": 18700 + }, + { + "epoch": 10.47006155567991, + "grad_norm": 0.25674813985824585, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0047, + "step": 18710 + }, + { + "epoch": 10.475657526580862, + "grad_norm": 0.09696432203054428, + "learning_rate": 1.1156571792324211e-06, + "loss": 0.0038, + "step": 18720 + }, + { + "epoch": 10.481253497481813, + "grad_norm": 0.17716100811958313, + "learning_rate": 1.0983569864968346e-06, + "loss": 0.0085, + "step": 18730 + }, + { + "epoch": 10.486849468382765, + "grad_norm": 0.18763263523578644, + "learning_rate": 1.0811904889859336e-06, + "loss": 0.009, + "step": 18740 + }, + { + "epoch": 10.492445439283715, + "grad_norm": 0.047968145459890366, + "learning_rate": 1.064157733632276e-06, + "loss": 0.0051, + "step": 18750 + }, + { + "epoch": 10.498041410184667, + "grad_norm": 0.1565999537706375, + "learning_rate": 1.0472587670027678e-06, + "loss": 0.0062, + "step": 18760 + }, + { + "epoch": 10.503637381085618, + "grad_norm": 0.06519567221403122, + "learning_rate": 1.030493635298535e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 10.50923335198657, + "grad_norm": 0.10364692658185959, + "learning_rate": 1.0138623843548078e-06, + "loss": 0.0051, + "step": 18780 + }, + { + "epoch": 10.514829322887522, + "grad_norm": 0.036633651703596115, + "learning_rate": 9.97365059640787e-07, + "loss": 0.0062, + "step": 18790 + }, + { + "epoch": 10.520425293788472, + "grad_norm": 0.2015930861234665, + "learning_rate": 9.810017062595322e-07, + "loss": 0.0037, + "step": 18800 + }, + { + "epoch": 10.526021264689424, + "grad_norm": 0.1180974468588829, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0039, + "step": 18810 + }, + { + "epoch": 10.531617235590375, + "grad_norm": 0.07416771352291107, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0041, + "step": 18820 + }, + { + "epoch": 10.537213206491327, + "grad_norm": 0.05668334290385246, + "learning_rate": 9.327159196481138e-07, + "loss": 0.0059, + "step": 18830 + }, + { + "epoch": 10.542809177392277, + "grad_norm": 0.07584750652313232, + "learning_rate": 9.168888953011989e-07, + "loss": 0.0054, + "step": 18840 + }, + { + "epoch": 10.548405148293229, + "grad_norm": 0.06703902035951614, + "learning_rate": 9.011960623058202e-07, + "loss": 0.0039, + "step": 18850 + }, + { + "epoch": 10.55400111919418, + "grad_norm": 0.06538796424865723, + "learning_rate": 8.856374635655695e-07, + "loss": 0.0035, + "step": 18860 + }, + { + "epoch": 10.559597090095131, + "grad_norm": 0.09234767407178879, + "learning_rate": 8.702131416170656e-07, + "loss": 0.0047, + "step": 18870 + }, + { + "epoch": 10.565193060996084, + "grad_norm": 0.09068552404642105, + "learning_rate": 8.549231386298151e-07, + "loss": 0.0032, + "step": 18880 + }, + { + "epoch": 10.570789031897034, + "grad_norm": 0.2574044466018677, + "learning_rate": 8.397674964061075e-07, + "loss": 0.0123, + "step": 18890 + }, + { + "epoch": 10.576385002797986, + "grad_norm": 0.1742398738861084, + "learning_rate": 8.247462563808817e-07, + "loss": 0.005, + "step": 18900 + }, + { + "epoch": 10.581980973698936, + "grad_norm": 0.19498533010482788, + "learning_rate": 8.098594596216424e-07, + "loss": 0.0051, + "step": 18910 + }, + { + "epoch": 10.587576944599888, + "grad_norm": 0.1093849390745163, + "learning_rate": 7.951071468283167e-07, + "loss": 0.0062, + "step": 18920 + }, + { + "epoch": 10.593172915500839, + "grad_norm": 0.05242215842008591, + "learning_rate": 7.804893583331696e-07, + "loss": 0.0049, + "step": 18930 + }, + { + "epoch": 10.59876888640179, + "grad_norm": 0.06830724328756332, + "learning_rate": 7.66006134100672e-07, + "loss": 0.0031, + "step": 18940 + }, + { + "epoch": 10.604364857302741, + "grad_norm": 0.08541436493396759, + "learning_rate": 7.516575137274162e-07, + "loss": 0.0044, + "step": 18950 + }, + { + "epoch": 10.609960828203693, + "grad_norm": 0.042029768228530884, + "learning_rate": 7.374435364419674e-07, + "loss": 0.0043, + "step": 18960 + }, + { + "epoch": 10.615556799104645, + "grad_norm": 0.12100391089916229, + "learning_rate": 7.233642411048014e-07, + "loss": 0.0032, + "step": 18970 + }, + { + "epoch": 10.621152770005596, + "grad_norm": 0.04842936620116234, + "learning_rate": 7.094196662081831e-07, + "loss": 0.0052, + "step": 18980 + }, + { + "epoch": 10.626748740906548, + "grad_norm": 0.13397961854934692, + "learning_rate": 6.956098498760389e-07, + "loss": 0.0056, + "step": 18990 + }, + { + "epoch": 10.632344711807498, + "grad_norm": 0.19486455619335175, + "learning_rate": 6.819348298638839e-07, + "loss": 0.0029, + "step": 19000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.736887852980225e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-19500/config.json b/checkpoint-19500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-19500/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-19500/experiment_cfg/metadata.json b/checkpoint-19500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-19500/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-19500/model-00001-of-00002.safetensors b/checkpoint-19500/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c18aa25e3471c3fb8bca79f4fa0ce038dc64462 --- /dev/null +++ b/checkpoint-19500/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3388ac07a1fa3105f69e6bcb128b4de8ec95f2a98cce784fcb3d52380bbd2215 +size 4938446392 diff --git a/checkpoint-19500/model-00002-of-00002.safetensors b/checkpoint-19500/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5da737ac5c44f562218e6cd6f486f7678367198 --- /dev/null +++ b/checkpoint-19500/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45434fbd1ab7dcbe1a35a9edbe74726f936e38a793c1f011555e38a42dd6bf83 +size 3821736024 diff --git a/checkpoint-19500/model.safetensors.index.json b/checkpoint-19500/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-19500/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-19500/optimizer.pt b/checkpoint-19500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..89a17ad295da76be8aedd807449086d9fa57dbb3 --- /dev/null +++ b/checkpoint-19500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69fa7126fb47138b10925902f042944ee52272e7ec2429915a338f4f59c3118f +size 10272357262 diff --git a/checkpoint-19500/rng_state.pth b/checkpoint-19500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9bf4b5d4aacddab729ef1879ac45afd6991a722c --- /dev/null +++ b/checkpoint-19500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d10c596295c8876f5ef798e368f61bfe1fe2e1ace944eb617e428312a78a5e5 +size 14244 diff --git a/checkpoint-19500/scheduler.pt b/checkpoint-19500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc4884d3dc0903e773c9cb48fadaae4238f170ff --- /dev/null +++ b/checkpoint-19500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494281cda91aabc11e1fa705e567566ebd642376fa08323e1b67958cd76e1241 +size 1064 diff --git a/checkpoint-19500/trainer_state.json b/checkpoint-19500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..57bccff44eac8a3185e48dae9a0b23b44d3a0c33 --- /dev/null +++ b/checkpoint-19500/trainer_state.json @@ -0,0 +1,13683 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.912143256855064, + "eval_steps": 500, + "global_step": 19500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + }, + { + "epoch": 10.078343592613319, + "grad_norm": 0.07472005486488342, + "learning_rate": 2.682358534298285e-06, + "loss": 0.006, + "step": 18010 + }, + { + "epoch": 10.083939563514269, + "grad_norm": 0.09040942043066025, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.004, + "step": 18020 + }, + { + "epoch": 10.089535534415221, + "grad_norm": 0.037220001220703125, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0038, + "step": 18030 + }, + { + "epoch": 10.095131505316173, + "grad_norm": 0.11240635067224503, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0031, + "step": 18040 + }, + { + "epoch": 10.100727476217124, + "grad_norm": 0.12259605526924133, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0049, + "step": 18050 + }, + { + "epoch": 10.106323447118076, + "grad_norm": 0.16758129000663757, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0066, + "step": 18060 + }, + { + "epoch": 10.111919418019026, + "grad_norm": 0.10704974085092545, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.0021, + "step": 18070 + }, + { + "epoch": 10.117515388919978, + "grad_norm": 0.19684171676635742, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0035, + "step": 18080 + }, + { + "epoch": 10.123111359820928, + "grad_norm": 0.25069093704223633, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0038, + "step": 18090 + }, + { + "epoch": 10.12870733072188, + "grad_norm": 0.15888355672359467, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0055, + "step": 18100 + }, + { + "epoch": 10.13430330162283, + "grad_norm": 0.1355513483285904, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0032, + "step": 18110 + }, + { + "epoch": 10.139899272523783, + "grad_norm": 0.09521888941526413, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0072, + "step": 18120 + }, + { + "epoch": 10.145495243424735, + "grad_norm": 0.18774038553237915, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0028, + "step": 18130 + }, + { + "epoch": 10.151091214325685, + "grad_norm": 0.06293562054634094, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0032, + "step": 18140 + }, + { + "epoch": 10.156687185226637, + "grad_norm": 0.25169095396995544, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0092, + "step": 18150 + }, + { + "epoch": 10.162283156127588, + "grad_norm": 0.08376752585172653, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0021, + "step": 18160 + }, + { + "epoch": 10.16787912702854, + "grad_norm": 0.06758670508861542, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 10.17347509792949, + "grad_norm": 0.06193256378173828, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0036, + "step": 18180 + }, + { + "epoch": 10.179071068830442, + "grad_norm": 0.21087805926799774, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0043, + "step": 18190 + }, + { + "epoch": 10.184667039731393, + "grad_norm": 0.09581877291202545, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0048, + "step": 18200 + }, + { + "epoch": 10.190263010632345, + "grad_norm": 0.23187117278575897, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0068, + "step": 18210 + }, + { + "epoch": 10.195858981533297, + "grad_norm": 0.1904383897781372, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0036, + "step": 18220 + }, + { + "epoch": 10.201454952434247, + "grad_norm": 0.04588289558887482, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0075, + "step": 18230 + }, + { + "epoch": 10.2070509233352, + "grad_norm": 0.05446457862854004, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 10.21264689423615, + "grad_norm": 0.12907229363918304, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 10.218242865137102, + "grad_norm": 0.0724627822637558, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0024, + "step": 18260 + }, + { + "epoch": 10.223838836038052, + "grad_norm": 0.11797565221786499, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0032, + "step": 18270 + }, + { + "epoch": 10.229434806939004, + "grad_norm": 0.1239556148648262, + "learning_rate": 2.008446509461498e-06, + "loss": 0.0038, + "step": 18280 + }, + { + "epoch": 10.235030777839956, + "grad_norm": 0.05614084377884865, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0041, + "step": 18290 + }, + { + "epoch": 10.240626748740906, + "grad_norm": 0.2968387007713318, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0092, + "step": 18300 + }, + { + "epoch": 10.246222719641858, + "grad_norm": 0.11529407650232315, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0059, + "step": 18310 + }, + { + "epoch": 10.251818690542809, + "grad_norm": 0.24037353694438934, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0045, + "step": 18320 + }, + { + "epoch": 10.257414661443761, + "grad_norm": 0.20346900820732117, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 10.263010632344711, + "grad_norm": 0.27883380651474, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0049, + "step": 18340 + }, + { + "epoch": 10.268606603245663, + "grad_norm": 0.12232355028390884, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0077, + "step": 18350 + }, + { + "epoch": 10.274202574146614, + "grad_norm": 0.09397400170564651, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0039, + "step": 18360 + }, + { + "epoch": 10.279798545047566, + "grad_norm": 0.13843244314193726, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.0059, + "step": 18370 + }, + { + "epoch": 10.285394515948518, + "grad_norm": 0.04927824065089226, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.003, + "step": 18380 + }, + { + "epoch": 10.290990486849468, + "grad_norm": 0.2848436236381531, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.004, + "step": 18390 + }, + { + "epoch": 10.29658645775042, + "grad_norm": 0.10808296501636505, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0072, + "step": 18400 + }, + { + "epoch": 10.30218242865137, + "grad_norm": 0.08363109827041626, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0029, + "step": 18410 + }, + { + "epoch": 10.307778399552323, + "grad_norm": 0.07970738410949707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0058, + "step": 18420 + }, + { + "epoch": 10.313374370453273, + "grad_norm": 0.06155739724636078, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0042, + "step": 18430 + }, + { + "epoch": 10.318970341354225, + "grad_norm": 0.15518154203891754, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.006, + "step": 18440 + }, + { + "epoch": 10.324566312255175, + "grad_norm": 0.06478218734264374, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0083, + "step": 18450 + }, + { + "epoch": 10.330162283156128, + "grad_norm": 0.11871203780174255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 10.33575825405708, + "grad_norm": 0.13164940476417542, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 10.34135422495803, + "grad_norm": 0.08165992051362991, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0065, + "step": 18480 + }, + { + "epoch": 10.346950195858982, + "grad_norm": 0.06479761004447937, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0044, + "step": 18490 + }, + { + "epoch": 10.352546166759932, + "grad_norm": 0.24080127477645874, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0085, + "step": 18500 + }, + { + "epoch": 10.358142137660884, + "grad_norm": 0.1411421000957489, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0045, + "step": 18510 + }, + { + "epoch": 10.363738108561835, + "grad_norm": 0.058580052107572556, + "learning_rate": 1.4896615227983468e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 10.369334079462787, + "grad_norm": 0.1638147383928299, + "learning_rate": 1.4696978433290653e-06, + "loss": 0.0054, + "step": 18530 + }, + { + "epoch": 10.374930050363739, + "grad_norm": 0.05566524341702461, + "learning_rate": 1.4498668438527597e-06, + "loss": 0.004, + "step": 18540 + }, + { + "epoch": 10.38052602126469, + "grad_norm": 0.07601140439510345, + "learning_rate": 1.4301685785866214e-06, + "loss": 0.0034, + "step": 18550 + }, + { + "epoch": 10.386121992165641, + "grad_norm": 0.10449633747339249, + "learning_rate": 1.4106031013849496e-06, + "loss": 0.0041, + "step": 18560 + }, + { + "epoch": 10.391717963066592, + "grad_norm": 0.15937356650829315, + "learning_rate": 1.3911704657390113e-06, + "loss": 0.0039, + "step": 18570 + }, + { + "epoch": 10.397313933967544, + "grad_norm": 0.059475306421518326, + "learning_rate": 1.3718707247769135e-06, + "loss": 0.006, + "step": 18580 + }, + { + "epoch": 10.402909904868494, + "grad_norm": 0.24354378879070282, + "learning_rate": 1.3527039312633827e-06, + "loss": 0.0042, + "step": 18590 + }, + { + "epoch": 10.408505875769446, + "grad_norm": 0.20878778398036957, + "learning_rate": 1.333670137599713e-06, + "loss": 0.0107, + "step": 18600 + }, + { + "epoch": 10.414101846670397, + "grad_norm": 0.1909496784210205, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0034, + "step": 18610 + }, + { + "epoch": 10.419697817571349, + "grad_norm": 0.13632823526859283, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0066, + "step": 18620 + }, + { + "epoch": 10.4252937884723, + "grad_norm": 0.10793755203485489, + "learning_rate": 1.2773672742655784e-06, + "loss": 0.0037, + "step": 18630 + }, + { + "epoch": 10.430889759373251, + "grad_norm": 0.10346037149429321, + "learning_rate": 1.2588659967397e-06, + "loss": 0.0044, + "step": 18640 + }, + { + "epoch": 10.436485730274203, + "grad_norm": 0.08834080398082733, + "learning_rate": 1.2404979756130142e-06, + "loss": 0.0037, + "step": 18650 + }, + { + "epoch": 10.442081701175153, + "grad_norm": 0.09045784175395966, + "learning_rate": 1.222263261102985e-06, + "loss": 0.0052, + "step": 18660 + }, + { + "epoch": 10.447677672076106, + "grad_norm": 0.07731129229068756, + "learning_rate": 1.2041619030626284e-06, + "loss": 0.0071, + "step": 18670 + }, + { + "epoch": 10.453273642977056, + "grad_norm": 0.08769071102142334, + "learning_rate": 1.1861939509803687e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 10.458869613878008, + "grad_norm": 0.15766629576683044, + "learning_rate": 1.1683594539798893e-06, + "loss": 0.0063, + "step": 18690 + }, + { + "epoch": 10.46446558477896, + "grad_norm": 0.11048921942710876, + "learning_rate": 1.1506584608200367e-06, + "loss": 0.0033, + "step": 18700 + }, + { + "epoch": 10.47006155567991, + "grad_norm": 0.25674813985824585, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0047, + "step": 18710 + }, + { + "epoch": 10.475657526580862, + "grad_norm": 0.09696432203054428, + "learning_rate": 1.1156571792324211e-06, + "loss": 0.0038, + "step": 18720 + }, + { + "epoch": 10.481253497481813, + "grad_norm": 0.17716100811958313, + "learning_rate": 1.0983569864968346e-06, + "loss": 0.0085, + "step": 18730 + }, + { + "epoch": 10.486849468382765, + "grad_norm": 0.18763263523578644, + "learning_rate": 1.0811904889859336e-06, + "loss": 0.009, + "step": 18740 + }, + { + "epoch": 10.492445439283715, + "grad_norm": 0.047968145459890366, + "learning_rate": 1.064157733632276e-06, + "loss": 0.0051, + "step": 18750 + }, + { + "epoch": 10.498041410184667, + "grad_norm": 0.1565999537706375, + "learning_rate": 1.0472587670027678e-06, + "loss": 0.0062, + "step": 18760 + }, + { + "epoch": 10.503637381085618, + "grad_norm": 0.06519567221403122, + "learning_rate": 1.030493635298535e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 10.50923335198657, + "grad_norm": 0.10364692658185959, + "learning_rate": 1.0138623843548078e-06, + "loss": 0.0051, + "step": 18780 + }, + { + "epoch": 10.514829322887522, + "grad_norm": 0.036633651703596115, + "learning_rate": 9.97365059640787e-07, + "loss": 0.0062, + "step": 18790 + }, + { + "epoch": 10.520425293788472, + "grad_norm": 0.2015930861234665, + "learning_rate": 9.810017062595322e-07, + "loss": 0.0037, + "step": 18800 + }, + { + "epoch": 10.526021264689424, + "grad_norm": 0.1180974468588829, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0039, + "step": 18810 + }, + { + "epoch": 10.531617235590375, + "grad_norm": 0.07416771352291107, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0041, + "step": 18820 + }, + { + "epoch": 10.537213206491327, + "grad_norm": 0.05668334290385246, + "learning_rate": 9.327159196481138e-07, + "loss": 0.0059, + "step": 18830 + }, + { + "epoch": 10.542809177392277, + "grad_norm": 0.07584750652313232, + "learning_rate": 9.168888953011989e-07, + "loss": 0.0054, + "step": 18840 + }, + { + "epoch": 10.548405148293229, + "grad_norm": 0.06703902035951614, + "learning_rate": 9.011960623058202e-07, + "loss": 0.0039, + "step": 18850 + }, + { + "epoch": 10.55400111919418, + "grad_norm": 0.06538796424865723, + "learning_rate": 8.856374635655695e-07, + "loss": 0.0035, + "step": 18860 + }, + { + "epoch": 10.559597090095131, + "grad_norm": 0.09234767407178879, + "learning_rate": 8.702131416170656e-07, + "loss": 0.0047, + "step": 18870 + }, + { + "epoch": 10.565193060996084, + "grad_norm": 0.09068552404642105, + "learning_rate": 8.549231386298151e-07, + "loss": 0.0032, + "step": 18880 + }, + { + "epoch": 10.570789031897034, + "grad_norm": 0.2574044466018677, + "learning_rate": 8.397674964061075e-07, + "loss": 0.0123, + "step": 18890 + }, + { + "epoch": 10.576385002797986, + "grad_norm": 0.1742398738861084, + "learning_rate": 8.247462563808817e-07, + "loss": 0.005, + "step": 18900 + }, + { + "epoch": 10.581980973698936, + "grad_norm": 0.19498533010482788, + "learning_rate": 8.098594596216424e-07, + "loss": 0.0051, + "step": 18910 + }, + { + "epoch": 10.587576944599888, + "grad_norm": 0.1093849390745163, + "learning_rate": 7.951071468283167e-07, + "loss": 0.0062, + "step": 18920 + }, + { + "epoch": 10.593172915500839, + "grad_norm": 0.05242215842008591, + "learning_rate": 7.804893583331696e-07, + "loss": 0.0049, + "step": 18930 + }, + { + "epoch": 10.59876888640179, + "grad_norm": 0.06830724328756332, + "learning_rate": 7.66006134100672e-07, + "loss": 0.0031, + "step": 18940 + }, + { + "epoch": 10.604364857302741, + "grad_norm": 0.08541436493396759, + "learning_rate": 7.516575137274162e-07, + "loss": 0.0044, + "step": 18950 + }, + { + "epoch": 10.609960828203693, + "grad_norm": 0.042029768228530884, + "learning_rate": 7.374435364419674e-07, + "loss": 0.0043, + "step": 18960 + }, + { + "epoch": 10.615556799104645, + "grad_norm": 0.12100391089916229, + "learning_rate": 7.233642411048014e-07, + "loss": 0.0032, + "step": 18970 + }, + { + "epoch": 10.621152770005596, + "grad_norm": 0.04842936620116234, + "learning_rate": 7.094196662081831e-07, + "loss": 0.0052, + "step": 18980 + }, + { + "epoch": 10.626748740906548, + "grad_norm": 0.13397961854934692, + "learning_rate": 6.956098498760389e-07, + "loss": 0.0056, + "step": 18990 + }, + { + "epoch": 10.632344711807498, + "grad_norm": 0.19486455619335175, + "learning_rate": 6.819348298638839e-07, + "loss": 0.0029, + "step": 19000 + }, + { + "epoch": 10.63794068270845, + "grad_norm": 0.1525876224040985, + "learning_rate": 6.683946435586952e-07, + "loss": 0.0142, + "step": 19010 + }, + { + "epoch": 10.6435366536094, + "grad_norm": 0.09059377759695053, + "learning_rate": 6.549893279788277e-07, + "loss": 0.0057, + "step": 19020 + }, + { + "epoch": 10.649132624510353, + "grad_norm": 0.08628048002719879, + "learning_rate": 6.417189197739093e-07, + "loss": 0.0059, + "step": 19030 + }, + { + "epoch": 10.654728595411305, + "grad_norm": 0.34853503108024597, + "learning_rate": 6.285834552247128e-07, + "loss": 0.0041, + "step": 19040 + }, + { + "epoch": 10.660324566312255, + "grad_norm": 0.1580825001001358, + "learning_rate": 6.15582970243117e-07, + "loss": 0.0059, + "step": 19050 + }, + { + "epoch": 10.665920537213207, + "grad_norm": 0.2064519226551056, + "learning_rate": 6.027175003719354e-07, + "loss": 0.0065, + "step": 19060 + }, + { + "epoch": 10.671516508114157, + "grad_norm": 0.1656566709280014, + "learning_rate": 5.899870807848762e-07, + "loss": 0.0045, + "step": 19070 + }, + { + "epoch": 10.67711247901511, + "grad_norm": 0.06346923857927322, + "learning_rate": 5.773917462864264e-07, + "loss": 0.0108, + "step": 19080 + }, + { + "epoch": 10.68270844991606, + "grad_norm": 0.0746588408946991, + "learning_rate": 5.64931531311741e-07, + "loss": 0.0038, + "step": 19090 + }, + { + "epoch": 10.688304420817012, + "grad_norm": 0.10566951334476471, + "learning_rate": 5.526064699265753e-07, + "loss": 0.0084, + "step": 19100 + }, + { + "epoch": 10.693900391717962, + "grad_norm": 0.061587151139974594, + "learning_rate": 5.404165958271811e-07, + "loss": 0.0042, + "step": 19110 + }, + { + "epoch": 10.699496362618914, + "grad_norm": 0.27593472599983215, + "learning_rate": 5.283619423401998e-07, + "loss": 0.005, + "step": 19120 + }, + { + "epoch": 10.705092333519866, + "grad_norm": 0.37827596068382263, + "learning_rate": 5.164425424226016e-07, + "loss": 0.0068, + "step": 19130 + }, + { + "epoch": 10.710688304420817, + "grad_norm": 0.2789309322834015, + "learning_rate": 5.046584286615697e-07, + "loss": 0.0054, + "step": 19140 + }, + { + "epoch": 10.716284275321769, + "grad_norm": 0.08417310565710068, + "learning_rate": 4.930096332744105e-07, + "loss": 0.0043, + "step": 19150 + }, + { + "epoch": 10.72188024622272, + "grad_norm": 0.13277283310890198, + "learning_rate": 4.814961881085045e-07, + "loss": 0.007, + "step": 19160 + }, + { + "epoch": 10.727476217123671, + "grad_norm": 0.029057292267680168, + "learning_rate": 4.701181246411501e-07, + "loss": 0.0077, + "step": 19170 + }, + { + "epoch": 10.733072188024622, + "grad_norm": 0.07132174074649811, + "learning_rate": 4.5887547397955864e-07, + "loss": 0.0044, + "step": 19180 + }, + { + "epoch": 10.738668158925574, + "grad_norm": 0.05213991925120354, + "learning_rate": 4.4776826686069305e-07, + "loss": 0.0022, + "step": 19190 + }, + { + "epoch": 10.744264129826526, + "grad_norm": 0.092039555311203, + "learning_rate": 4.367965336512403e-07, + "loss": 0.0032, + "step": 19200 + }, + { + "epoch": 10.749860100727476, + "grad_norm": 0.17352578043937683, + "learning_rate": 4.259603043475002e-07, + "loss": 0.0064, + "step": 19210 + }, + { + "epoch": 10.755456071628428, + "grad_norm": 0.15915948152542114, + "learning_rate": 4.1525960857530243e-07, + "loss": 0.0075, + "step": 19220 + }, + { + "epoch": 10.761052042529379, + "grad_norm": 0.21297423541545868, + "learning_rate": 4.0469447558995065e-07, + "loss": 0.0057, + "step": 19230 + }, + { + "epoch": 10.76664801343033, + "grad_norm": 0.17462663352489471, + "learning_rate": 3.9426493427611177e-07, + "loss": 0.0056, + "step": 19240 + }, + { + "epoch": 10.772243984331281, + "grad_norm": 0.10657753050327301, + "learning_rate": 3.839710131477492e-07, + "loss": 0.0089, + "step": 19250 + }, + { + "epoch": 10.777839955232233, + "grad_norm": 0.07254552841186523, + "learning_rate": 3.738127403480507e-07, + "loss": 0.003, + "step": 19260 + }, + { + "epoch": 10.783435926133183, + "grad_norm": 0.27843359112739563, + "learning_rate": 3.637901436493507e-07, + "loss": 0.0067, + "step": 19270 + }, + { + "epoch": 10.789031897034135, + "grad_norm": 0.17431190609931946, + "learning_rate": 3.5390325045304706e-07, + "loss": 0.0042, + "step": 19280 + }, + { + "epoch": 10.794627867935088, + "grad_norm": 0.11761761456727982, + "learning_rate": 3.441520877895288e-07, + "loss": 0.0036, + "step": 19290 + }, + { + "epoch": 10.800223838836038, + "grad_norm": 0.1055087074637413, + "learning_rate": 3.3453668231809286e-07, + "loss": 0.0049, + "step": 19300 + }, + { + "epoch": 10.80581980973699, + "grad_norm": 0.05716053023934364, + "learning_rate": 3.250570603268943e-07, + "loss": 0.0057, + "step": 19310 + }, + { + "epoch": 10.81141578063794, + "grad_norm": 0.06227661669254303, + "learning_rate": 3.157132477328628e-07, + "loss": 0.0047, + "step": 19320 + }, + { + "epoch": 10.817011751538892, + "grad_norm": 0.07587496936321259, + "learning_rate": 3.0650527008162513e-07, + "loss": 0.0058, + "step": 19330 + }, + { + "epoch": 10.822607722439843, + "grad_norm": 0.12384708225727081, + "learning_rate": 2.9743315254743833e-07, + "loss": 0.0044, + "step": 19340 + }, + { + "epoch": 10.828203693340795, + "grad_norm": 0.130027636885643, + "learning_rate": 2.8849691993311777e-07, + "loss": 0.0048, + "step": 19350 + }, + { + "epoch": 10.833799664241745, + "grad_norm": 0.03498604893684387, + "learning_rate": 2.796965966699927e-07, + "loss": 0.0076, + "step": 19360 + }, + { + "epoch": 10.839395635142697, + "grad_norm": 0.06795532256364822, + "learning_rate": 2.7103220681780615e-07, + "loss": 0.0046, + "step": 19370 + }, + { + "epoch": 10.84499160604365, + "grad_norm": 0.15649089217185974, + "learning_rate": 2.625037740646763e-07, + "loss": 0.0041, + "step": 19380 + }, + { + "epoch": 10.8505875769446, + "grad_norm": 0.19872230291366577, + "learning_rate": 2.5411132172700194e-07, + "loss": 0.0045, + "step": 19390 + }, + { + "epoch": 10.856183547845552, + "grad_norm": 0.1986837238073349, + "learning_rate": 2.458548727494292e-07, + "loss": 0.0034, + "step": 19400 + }, + { + "epoch": 10.861779518746502, + "grad_norm": 0.34645870327949524, + "learning_rate": 2.3773444970477955e-07, + "loss": 0.0059, + "step": 19410 + }, + { + "epoch": 10.867375489647454, + "grad_norm": 0.043271441012620926, + "learning_rate": 2.2975007479397738e-07, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 10.872971460548404, + "grad_norm": 0.10621374845504761, + "learning_rate": 2.219017698460002e-07, + "loss": 0.0107, + "step": 19430 + }, + { + "epoch": 10.878567431449357, + "grad_norm": 0.038412097841501236, + "learning_rate": 2.1418955631781202e-07, + "loss": 0.0025, + "step": 19440 + }, + { + "epoch": 10.884163402350307, + "grad_norm": 0.14375977218151093, + "learning_rate": 2.0661345529430775e-07, + "loss": 0.0063, + "step": 19450 + }, + { + "epoch": 10.889759373251259, + "grad_norm": 0.28644490242004395, + "learning_rate": 1.9917348748826335e-07, + "loss": 0.0037, + "step": 19460 + }, + { + "epoch": 10.895355344152211, + "grad_norm": 0.19371145963668823, + "learning_rate": 1.918696732402636e-07, + "loss": 0.0071, + "step": 19470 + }, + { + "epoch": 10.900951315053161, + "grad_norm": 0.11907006055116653, + "learning_rate": 1.847020325186577e-07, + "loss": 0.0049, + "step": 19480 + }, + { + "epoch": 10.906547285954113, + "grad_norm": 0.10020023584365845, + "learning_rate": 1.776705849195037e-07, + "loss": 0.0036, + "step": 19490 + }, + { + "epoch": 10.912143256855064, + "grad_norm": 0.12778791785240173, + "learning_rate": 1.7077534966650766e-07, + "loss": 0.0057, + "step": 19500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.914238548376065e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-20000/config.json b/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/checkpoint-20000/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/checkpoint-20000/experiment_cfg/metadata.json b/checkpoint-20000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/checkpoint-20000/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-20000/model-00001-of-00002.safetensors b/checkpoint-20000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1af3c78faba3d8e1dafcb8fae5aca98c3d8a2ff8 --- /dev/null +++ b/checkpoint-20000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868027eb73128c8a91a9183c79c269dded0b87001a41f4836fb8fb945f17f3ee +size 4938446392 diff --git a/checkpoint-20000/model-00002-of-00002.safetensors b/checkpoint-20000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6eb984a90e36390d9452da71b260ffe74d62da9f --- /dev/null +++ b/checkpoint-20000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdcd5c4024fdfc3f6132554a06dc56cf2529b6e88cf5cd377def158edaf9b1f8 +size 3821736024 diff --git a/checkpoint-20000/model.safetensors.index.json b/checkpoint-20000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/checkpoint-20000/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/checkpoint-20000/optimizer.pt b/checkpoint-20000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..17d2a7e2892d5074106d206605278330ef84426f --- /dev/null +++ b/checkpoint-20000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a847e1f5e1b6a48a836c727c9bc4e8cf97b8a030556e2e112ea933d279827fae +size 10272357262 diff --git a/checkpoint-20000/rng_state.pth b/checkpoint-20000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..09109423d2c6310f78da9ceaa79de5ea6d8a8343 --- /dev/null +++ b/checkpoint-20000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524112d7de8815c25e761e10f21c75da9bdc16221bccd49f1bd3dea35fcff893 +size 14244 diff --git a/checkpoint-20000/scheduler.pt b/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..29e18cd68ffa7df8667f63f0f82f83adce3b7e47 --- /dev/null +++ b/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96dedd731c018af5121e5a6273b315a5ea44ce825ed023675cb4a9716b4911be +size 1064 diff --git a/checkpoint-20000/trainer_state.json b/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1f0cdc63f8081af07f20fefb333afdaf85f8c1b --- /dev/null +++ b/checkpoint-20000/trainer_state.json @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.19194180190263, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + }, + { + "epoch": 10.078343592613319, + "grad_norm": 0.07472005486488342, + "learning_rate": 2.682358534298285e-06, + "loss": 0.006, + "step": 18010 + }, + { + "epoch": 10.083939563514269, + "grad_norm": 0.09040942043066025, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.004, + "step": 18020 + }, + { + "epoch": 10.089535534415221, + "grad_norm": 0.037220001220703125, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0038, + "step": 18030 + }, + { + "epoch": 10.095131505316173, + "grad_norm": 0.11240635067224503, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0031, + "step": 18040 + }, + { + "epoch": 10.100727476217124, + "grad_norm": 0.12259605526924133, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0049, + "step": 18050 + }, + { + "epoch": 10.106323447118076, + "grad_norm": 0.16758129000663757, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0066, + "step": 18060 + }, + { + "epoch": 10.111919418019026, + "grad_norm": 0.10704974085092545, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.0021, + "step": 18070 + }, + { + "epoch": 10.117515388919978, + "grad_norm": 0.19684171676635742, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0035, + "step": 18080 + }, + { + "epoch": 10.123111359820928, + "grad_norm": 0.25069093704223633, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0038, + "step": 18090 + }, + { + "epoch": 10.12870733072188, + "grad_norm": 0.15888355672359467, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0055, + "step": 18100 + }, + { + "epoch": 10.13430330162283, + "grad_norm": 0.1355513483285904, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0032, + "step": 18110 + }, + { + "epoch": 10.139899272523783, + "grad_norm": 0.09521888941526413, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0072, + "step": 18120 + }, + { + "epoch": 10.145495243424735, + "grad_norm": 0.18774038553237915, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0028, + "step": 18130 + }, + { + "epoch": 10.151091214325685, + "grad_norm": 0.06293562054634094, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0032, + "step": 18140 + }, + { + "epoch": 10.156687185226637, + "grad_norm": 0.25169095396995544, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0092, + "step": 18150 + }, + { + "epoch": 10.162283156127588, + "grad_norm": 0.08376752585172653, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0021, + "step": 18160 + }, + { + "epoch": 10.16787912702854, + "grad_norm": 0.06758670508861542, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 10.17347509792949, + "grad_norm": 0.06193256378173828, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0036, + "step": 18180 + }, + { + "epoch": 10.179071068830442, + "grad_norm": 0.21087805926799774, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0043, + "step": 18190 + }, + { + "epoch": 10.184667039731393, + "grad_norm": 0.09581877291202545, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0048, + "step": 18200 + }, + { + "epoch": 10.190263010632345, + "grad_norm": 0.23187117278575897, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0068, + "step": 18210 + }, + { + "epoch": 10.195858981533297, + "grad_norm": 0.1904383897781372, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0036, + "step": 18220 + }, + { + "epoch": 10.201454952434247, + "grad_norm": 0.04588289558887482, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0075, + "step": 18230 + }, + { + "epoch": 10.2070509233352, + "grad_norm": 0.05446457862854004, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 10.21264689423615, + "grad_norm": 0.12907229363918304, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 10.218242865137102, + "grad_norm": 0.0724627822637558, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0024, + "step": 18260 + }, + { + "epoch": 10.223838836038052, + "grad_norm": 0.11797565221786499, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0032, + "step": 18270 + }, + { + "epoch": 10.229434806939004, + "grad_norm": 0.1239556148648262, + "learning_rate": 2.008446509461498e-06, + "loss": 0.0038, + "step": 18280 + }, + { + "epoch": 10.235030777839956, + "grad_norm": 0.05614084377884865, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0041, + "step": 18290 + }, + { + "epoch": 10.240626748740906, + "grad_norm": 0.2968387007713318, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0092, + "step": 18300 + }, + { + "epoch": 10.246222719641858, + "grad_norm": 0.11529407650232315, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0059, + "step": 18310 + }, + { + "epoch": 10.251818690542809, + "grad_norm": 0.24037353694438934, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0045, + "step": 18320 + }, + { + "epoch": 10.257414661443761, + "grad_norm": 0.20346900820732117, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 10.263010632344711, + "grad_norm": 0.27883380651474, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0049, + "step": 18340 + }, + { + "epoch": 10.268606603245663, + "grad_norm": 0.12232355028390884, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0077, + "step": 18350 + }, + { + "epoch": 10.274202574146614, + "grad_norm": 0.09397400170564651, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0039, + "step": 18360 + }, + { + "epoch": 10.279798545047566, + "grad_norm": 0.13843244314193726, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.0059, + "step": 18370 + }, + { + "epoch": 10.285394515948518, + "grad_norm": 0.04927824065089226, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.003, + "step": 18380 + }, + { + "epoch": 10.290990486849468, + "grad_norm": 0.2848436236381531, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.004, + "step": 18390 + }, + { + "epoch": 10.29658645775042, + "grad_norm": 0.10808296501636505, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0072, + "step": 18400 + }, + { + "epoch": 10.30218242865137, + "grad_norm": 0.08363109827041626, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0029, + "step": 18410 + }, + { + "epoch": 10.307778399552323, + "grad_norm": 0.07970738410949707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0058, + "step": 18420 + }, + { + "epoch": 10.313374370453273, + "grad_norm": 0.06155739724636078, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0042, + "step": 18430 + }, + { + "epoch": 10.318970341354225, + "grad_norm": 0.15518154203891754, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.006, + "step": 18440 + }, + { + "epoch": 10.324566312255175, + "grad_norm": 0.06478218734264374, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0083, + "step": 18450 + }, + { + "epoch": 10.330162283156128, + "grad_norm": 0.11871203780174255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 10.33575825405708, + "grad_norm": 0.13164940476417542, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 10.34135422495803, + "grad_norm": 0.08165992051362991, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0065, + "step": 18480 + }, + { + "epoch": 10.346950195858982, + "grad_norm": 0.06479761004447937, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0044, + "step": 18490 + }, + { + "epoch": 10.352546166759932, + "grad_norm": 0.24080127477645874, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0085, + "step": 18500 + }, + { + "epoch": 10.358142137660884, + "grad_norm": 0.1411421000957489, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0045, + "step": 18510 + }, + { + "epoch": 10.363738108561835, + "grad_norm": 0.058580052107572556, + "learning_rate": 1.4896615227983468e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 10.369334079462787, + "grad_norm": 0.1638147383928299, + "learning_rate": 1.4696978433290653e-06, + "loss": 0.0054, + "step": 18530 + }, + { + "epoch": 10.374930050363739, + "grad_norm": 0.05566524341702461, + "learning_rate": 1.4498668438527597e-06, + "loss": 0.004, + "step": 18540 + }, + { + "epoch": 10.38052602126469, + "grad_norm": 0.07601140439510345, + "learning_rate": 1.4301685785866214e-06, + "loss": 0.0034, + "step": 18550 + }, + { + "epoch": 10.386121992165641, + "grad_norm": 0.10449633747339249, + "learning_rate": 1.4106031013849496e-06, + "loss": 0.0041, + "step": 18560 + }, + { + "epoch": 10.391717963066592, + "grad_norm": 0.15937356650829315, + "learning_rate": 1.3911704657390113e-06, + "loss": 0.0039, + "step": 18570 + }, + { + "epoch": 10.397313933967544, + "grad_norm": 0.059475306421518326, + "learning_rate": 1.3718707247769135e-06, + "loss": 0.006, + "step": 18580 + }, + { + "epoch": 10.402909904868494, + "grad_norm": 0.24354378879070282, + "learning_rate": 1.3527039312633827e-06, + "loss": 0.0042, + "step": 18590 + }, + { + "epoch": 10.408505875769446, + "grad_norm": 0.20878778398036957, + "learning_rate": 1.333670137599713e-06, + "loss": 0.0107, + "step": 18600 + }, + { + "epoch": 10.414101846670397, + "grad_norm": 0.1909496784210205, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0034, + "step": 18610 + }, + { + "epoch": 10.419697817571349, + "grad_norm": 0.13632823526859283, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0066, + "step": 18620 + }, + { + "epoch": 10.4252937884723, + "grad_norm": 0.10793755203485489, + "learning_rate": 1.2773672742655784e-06, + "loss": 0.0037, + "step": 18630 + }, + { + "epoch": 10.430889759373251, + "grad_norm": 0.10346037149429321, + "learning_rate": 1.2588659967397e-06, + "loss": 0.0044, + "step": 18640 + }, + { + "epoch": 10.436485730274203, + "grad_norm": 0.08834080398082733, + "learning_rate": 1.2404979756130142e-06, + "loss": 0.0037, + "step": 18650 + }, + { + "epoch": 10.442081701175153, + "grad_norm": 0.09045784175395966, + "learning_rate": 1.222263261102985e-06, + "loss": 0.0052, + "step": 18660 + }, + { + "epoch": 10.447677672076106, + "grad_norm": 0.07731129229068756, + "learning_rate": 1.2041619030626284e-06, + "loss": 0.0071, + "step": 18670 + }, + { + "epoch": 10.453273642977056, + "grad_norm": 0.08769071102142334, + "learning_rate": 1.1861939509803687e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 10.458869613878008, + "grad_norm": 0.15766629576683044, + "learning_rate": 1.1683594539798893e-06, + "loss": 0.0063, + "step": 18690 + }, + { + "epoch": 10.46446558477896, + "grad_norm": 0.11048921942710876, + "learning_rate": 1.1506584608200367e-06, + "loss": 0.0033, + "step": 18700 + }, + { + "epoch": 10.47006155567991, + "grad_norm": 0.25674813985824585, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0047, + "step": 18710 + }, + { + "epoch": 10.475657526580862, + "grad_norm": 0.09696432203054428, + "learning_rate": 1.1156571792324211e-06, + "loss": 0.0038, + "step": 18720 + }, + { + "epoch": 10.481253497481813, + "grad_norm": 0.17716100811958313, + "learning_rate": 1.0983569864968346e-06, + "loss": 0.0085, + "step": 18730 + }, + { + "epoch": 10.486849468382765, + "grad_norm": 0.18763263523578644, + "learning_rate": 1.0811904889859336e-06, + "loss": 0.009, + "step": 18740 + }, + { + "epoch": 10.492445439283715, + "grad_norm": 0.047968145459890366, + "learning_rate": 1.064157733632276e-06, + "loss": 0.0051, + "step": 18750 + }, + { + "epoch": 10.498041410184667, + "grad_norm": 0.1565999537706375, + "learning_rate": 1.0472587670027678e-06, + "loss": 0.0062, + "step": 18760 + }, + { + "epoch": 10.503637381085618, + "grad_norm": 0.06519567221403122, + "learning_rate": 1.030493635298535e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 10.50923335198657, + "grad_norm": 0.10364692658185959, + "learning_rate": 1.0138623843548078e-06, + "loss": 0.0051, + "step": 18780 + }, + { + "epoch": 10.514829322887522, + "grad_norm": 0.036633651703596115, + "learning_rate": 9.97365059640787e-07, + "loss": 0.0062, + "step": 18790 + }, + { + "epoch": 10.520425293788472, + "grad_norm": 0.2015930861234665, + "learning_rate": 9.810017062595322e-07, + "loss": 0.0037, + "step": 18800 + }, + { + "epoch": 10.526021264689424, + "grad_norm": 0.1180974468588829, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0039, + "step": 18810 + }, + { + "epoch": 10.531617235590375, + "grad_norm": 0.07416771352291107, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0041, + "step": 18820 + }, + { + "epoch": 10.537213206491327, + "grad_norm": 0.05668334290385246, + "learning_rate": 9.327159196481138e-07, + "loss": 0.0059, + "step": 18830 + }, + { + "epoch": 10.542809177392277, + "grad_norm": 0.07584750652313232, + "learning_rate": 9.168888953011989e-07, + "loss": 0.0054, + "step": 18840 + }, + { + "epoch": 10.548405148293229, + "grad_norm": 0.06703902035951614, + "learning_rate": 9.011960623058202e-07, + "loss": 0.0039, + "step": 18850 + }, + { + "epoch": 10.55400111919418, + "grad_norm": 0.06538796424865723, + "learning_rate": 8.856374635655695e-07, + "loss": 0.0035, + "step": 18860 + }, + { + "epoch": 10.559597090095131, + "grad_norm": 0.09234767407178879, + "learning_rate": 8.702131416170656e-07, + "loss": 0.0047, + "step": 18870 + }, + { + "epoch": 10.565193060996084, + "grad_norm": 0.09068552404642105, + "learning_rate": 8.549231386298151e-07, + "loss": 0.0032, + "step": 18880 + }, + { + "epoch": 10.570789031897034, + "grad_norm": 0.2574044466018677, + "learning_rate": 8.397674964061075e-07, + "loss": 0.0123, + "step": 18890 + }, + { + "epoch": 10.576385002797986, + "grad_norm": 0.1742398738861084, + "learning_rate": 8.247462563808817e-07, + "loss": 0.005, + "step": 18900 + }, + { + "epoch": 10.581980973698936, + "grad_norm": 0.19498533010482788, + "learning_rate": 8.098594596216424e-07, + "loss": 0.0051, + "step": 18910 + }, + { + "epoch": 10.587576944599888, + "grad_norm": 0.1093849390745163, + "learning_rate": 7.951071468283167e-07, + "loss": 0.0062, + "step": 18920 + }, + { + "epoch": 10.593172915500839, + "grad_norm": 0.05242215842008591, + "learning_rate": 7.804893583331696e-07, + "loss": 0.0049, + "step": 18930 + }, + { + "epoch": 10.59876888640179, + "grad_norm": 0.06830724328756332, + "learning_rate": 7.66006134100672e-07, + "loss": 0.0031, + "step": 18940 + }, + { + "epoch": 10.604364857302741, + "grad_norm": 0.08541436493396759, + "learning_rate": 7.516575137274162e-07, + "loss": 0.0044, + "step": 18950 + }, + { + "epoch": 10.609960828203693, + "grad_norm": 0.042029768228530884, + "learning_rate": 7.374435364419674e-07, + "loss": 0.0043, + "step": 18960 + }, + { + "epoch": 10.615556799104645, + "grad_norm": 0.12100391089916229, + "learning_rate": 7.233642411048014e-07, + "loss": 0.0032, + "step": 18970 + }, + { + "epoch": 10.621152770005596, + "grad_norm": 0.04842936620116234, + "learning_rate": 7.094196662081831e-07, + "loss": 0.0052, + "step": 18980 + }, + { + "epoch": 10.626748740906548, + "grad_norm": 0.13397961854934692, + "learning_rate": 6.956098498760389e-07, + "loss": 0.0056, + "step": 18990 + }, + { + "epoch": 10.632344711807498, + "grad_norm": 0.19486455619335175, + "learning_rate": 6.819348298638839e-07, + "loss": 0.0029, + "step": 19000 + }, + { + "epoch": 10.63794068270845, + "grad_norm": 0.1525876224040985, + "learning_rate": 6.683946435586952e-07, + "loss": 0.0142, + "step": 19010 + }, + { + "epoch": 10.6435366536094, + "grad_norm": 0.09059377759695053, + "learning_rate": 6.549893279788277e-07, + "loss": 0.0057, + "step": 19020 + }, + { + "epoch": 10.649132624510353, + "grad_norm": 0.08628048002719879, + "learning_rate": 6.417189197739093e-07, + "loss": 0.0059, + "step": 19030 + }, + { + "epoch": 10.654728595411305, + "grad_norm": 0.34853503108024597, + "learning_rate": 6.285834552247128e-07, + "loss": 0.0041, + "step": 19040 + }, + { + "epoch": 10.660324566312255, + "grad_norm": 0.1580825001001358, + "learning_rate": 6.15582970243117e-07, + "loss": 0.0059, + "step": 19050 + }, + { + "epoch": 10.665920537213207, + "grad_norm": 0.2064519226551056, + "learning_rate": 6.027175003719354e-07, + "loss": 0.0065, + "step": 19060 + }, + { + "epoch": 10.671516508114157, + "grad_norm": 0.1656566709280014, + "learning_rate": 5.899870807848762e-07, + "loss": 0.0045, + "step": 19070 + }, + { + "epoch": 10.67711247901511, + "grad_norm": 0.06346923857927322, + "learning_rate": 5.773917462864264e-07, + "loss": 0.0108, + "step": 19080 + }, + { + "epoch": 10.68270844991606, + "grad_norm": 0.0746588408946991, + "learning_rate": 5.64931531311741e-07, + "loss": 0.0038, + "step": 19090 + }, + { + "epoch": 10.688304420817012, + "grad_norm": 0.10566951334476471, + "learning_rate": 5.526064699265753e-07, + "loss": 0.0084, + "step": 19100 + }, + { + "epoch": 10.693900391717962, + "grad_norm": 0.061587151139974594, + "learning_rate": 5.404165958271811e-07, + "loss": 0.0042, + "step": 19110 + }, + { + "epoch": 10.699496362618914, + "grad_norm": 0.27593472599983215, + "learning_rate": 5.283619423401998e-07, + "loss": 0.005, + "step": 19120 + }, + { + "epoch": 10.705092333519866, + "grad_norm": 0.37827596068382263, + "learning_rate": 5.164425424226016e-07, + "loss": 0.0068, + "step": 19130 + }, + { + "epoch": 10.710688304420817, + "grad_norm": 0.2789309322834015, + "learning_rate": 5.046584286615697e-07, + "loss": 0.0054, + "step": 19140 + }, + { + "epoch": 10.716284275321769, + "grad_norm": 0.08417310565710068, + "learning_rate": 4.930096332744105e-07, + "loss": 0.0043, + "step": 19150 + }, + { + "epoch": 10.72188024622272, + "grad_norm": 0.13277283310890198, + "learning_rate": 4.814961881085045e-07, + "loss": 0.007, + "step": 19160 + }, + { + "epoch": 10.727476217123671, + "grad_norm": 0.029057292267680168, + "learning_rate": 4.701181246411501e-07, + "loss": 0.0077, + "step": 19170 + }, + { + "epoch": 10.733072188024622, + "grad_norm": 0.07132174074649811, + "learning_rate": 4.5887547397955864e-07, + "loss": 0.0044, + "step": 19180 + }, + { + "epoch": 10.738668158925574, + "grad_norm": 0.05213991925120354, + "learning_rate": 4.4776826686069305e-07, + "loss": 0.0022, + "step": 19190 + }, + { + "epoch": 10.744264129826526, + "grad_norm": 0.092039555311203, + "learning_rate": 4.367965336512403e-07, + "loss": 0.0032, + "step": 19200 + }, + { + "epoch": 10.749860100727476, + "grad_norm": 0.17352578043937683, + "learning_rate": 4.259603043475002e-07, + "loss": 0.0064, + "step": 19210 + }, + { + "epoch": 10.755456071628428, + "grad_norm": 0.15915948152542114, + "learning_rate": 4.1525960857530243e-07, + "loss": 0.0075, + "step": 19220 + }, + { + "epoch": 10.761052042529379, + "grad_norm": 0.21297423541545868, + "learning_rate": 4.0469447558995065e-07, + "loss": 0.0057, + "step": 19230 + }, + { + "epoch": 10.76664801343033, + "grad_norm": 0.17462663352489471, + "learning_rate": 3.9426493427611177e-07, + "loss": 0.0056, + "step": 19240 + }, + { + "epoch": 10.772243984331281, + "grad_norm": 0.10657753050327301, + "learning_rate": 3.839710131477492e-07, + "loss": 0.0089, + "step": 19250 + }, + { + "epoch": 10.777839955232233, + "grad_norm": 0.07254552841186523, + "learning_rate": 3.738127403480507e-07, + "loss": 0.003, + "step": 19260 + }, + { + "epoch": 10.783435926133183, + "grad_norm": 0.27843359112739563, + "learning_rate": 3.637901436493507e-07, + "loss": 0.0067, + "step": 19270 + }, + { + "epoch": 10.789031897034135, + "grad_norm": 0.17431190609931946, + "learning_rate": 3.5390325045304706e-07, + "loss": 0.0042, + "step": 19280 + }, + { + "epoch": 10.794627867935088, + "grad_norm": 0.11761761456727982, + "learning_rate": 3.441520877895288e-07, + "loss": 0.0036, + "step": 19290 + }, + { + "epoch": 10.800223838836038, + "grad_norm": 0.1055087074637413, + "learning_rate": 3.3453668231809286e-07, + "loss": 0.0049, + "step": 19300 + }, + { + "epoch": 10.80581980973699, + "grad_norm": 0.05716053023934364, + "learning_rate": 3.250570603268943e-07, + "loss": 0.0057, + "step": 19310 + }, + { + "epoch": 10.81141578063794, + "grad_norm": 0.06227661669254303, + "learning_rate": 3.157132477328628e-07, + "loss": 0.0047, + "step": 19320 + }, + { + "epoch": 10.817011751538892, + "grad_norm": 0.07587496936321259, + "learning_rate": 3.0650527008162513e-07, + "loss": 0.0058, + "step": 19330 + }, + { + "epoch": 10.822607722439843, + "grad_norm": 0.12384708225727081, + "learning_rate": 2.9743315254743833e-07, + "loss": 0.0044, + "step": 19340 + }, + { + "epoch": 10.828203693340795, + "grad_norm": 0.130027636885643, + "learning_rate": 2.8849691993311777e-07, + "loss": 0.0048, + "step": 19350 + }, + { + "epoch": 10.833799664241745, + "grad_norm": 0.03498604893684387, + "learning_rate": 2.796965966699927e-07, + "loss": 0.0076, + "step": 19360 + }, + { + "epoch": 10.839395635142697, + "grad_norm": 0.06795532256364822, + "learning_rate": 2.7103220681780615e-07, + "loss": 0.0046, + "step": 19370 + }, + { + "epoch": 10.84499160604365, + "grad_norm": 0.15649089217185974, + "learning_rate": 2.625037740646763e-07, + "loss": 0.0041, + "step": 19380 + }, + { + "epoch": 10.8505875769446, + "grad_norm": 0.19872230291366577, + "learning_rate": 2.5411132172700194e-07, + "loss": 0.0045, + "step": 19390 + }, + { + "epoch": 10.856183547845552, + "grad_norm": 0.1986837238073349, + "learning_rate": 2.458548727494292e-07, + "loss": 0.0034, + "step": 19400 + }, + { + "epoch": 10.861779518746502, + "grad_norm": 0.34645870327949524, + "learning_rate": 2.3773444970477955e-07, + "loss": 0.0059, + "step": 19410 + }, + { + "epoch": 10.867375489647454, + "grad_norm": 0.043271441012620926, + "learning_rate": 2.2975007479397738e-07, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 10.872971460548404, + "grad_norm": 0.10621374845504761, + "learning_rate": 2.219017698460002e-07, + "loss": 0.0107, + "step": 19430 + }, + { + "epoch": 10.878567431449357, + "grad_norm": 0.038412097841501236, + "learning_rate": 2.1418955631781202e-07, + "loss": 0.0025, + "step": 19440 + }, + { + "epoch": 10.884163402350307, + "grad_norm": 0.14375977218151093, + "learning_rate": 2.0661345529430775e-07, + "loss": 0.0063, + "step": 19450 + }, + { + "epoch": 10.889759373251259, + "grad_norm": 0.28644490242004395, + "learning_rate": 1.9917348748826335e-07, + "loss": 0.0037, + "step": 19460 + }, + { + "epoch": 10.895355344152211, + "grad_norm": 0.19371145963668823, + "learning_rate": 1.918696732402636e-07, + "loss": 0.0071, + "step": 19470 + }, + { + "epoch": 10.900951315053161, + "grad_norm": 0.11907006055116653, + "learning_rate": 1.847020325186577e-07, + "loss": 0.0049, + "step": 19480 + }, + { + "epoch": 10.906547285954113, + "grad_norm": 0.10020023584365845, + "learning_rate": 1.776705849195037e-07, + "loss": 0.0036, + "step": 19490 + }, + { + "epoch": 10.912143256855064, + "grad_norm": 0.12778791785240173, + "learning_rate": 1.7077534966650766e-07, + "loss": 0.0057, + "step": 19500 + }, + { + "epoch": 10.917739227756016, + "grad_norm": 0.06359223276376724, + "learning_rate": 1.6401634561098444e-07, + "loss": 0.0036, + "step": 19510 + }, + { + "epoch": 10.923335198656966, + "grad_norm": 0.07983513921499252, + "learning_rate": 1.5739359123178587e-07, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 10.928931169557918, + "grad_norm": 0.12060696631669998, + "learning_rate": 1.5090710463527836e-07, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 10.93452714045887, + "grad_norm": 0.10252276062965393, + "learning_rate": 1.4455690355525964e-07, + "loss": 0.0052, + "step": 19540 + }, + { + "epoch": 10.94012311135982, + "grad_norm": 0.10586907714605331, + "learning_rate": 1.383430053529422e-07, + "loss": 0.0025, + "step": 19550 + }, + { + "epoch": 10.945719082260773, + "grad_norm": 0.05571618303656578, + "learning_rate": 1.3226542701689215e-07, + "loss": 0.0045, + "step": 19560 + }, + { + "epoch": 10.951315053161723, + "grad_norm": 0.07698628306388855, + "learning_rate": 1.2632418516296262e-07, + "loss": 0.0039, + "step": 19570 + }, + { + "epoch": 10.956911024062675, + "grad_norm": 0.3049318790435791, + "learning_rate": 1.2051929603428825e-07, + "loss": 0.0036, + "step": 19580 + }, + { + "epoch": 10.962506994963626, + "grad_norm": 0.04247491434216499, + "learning_rate": 1.1485077550122402e-07, + "loss": 0.0086, + "step": 19590 + }, + { + "epoch": 10.968102965864578, + "grad_norm": 0.13998843729496002, + "learning_rate": 1.0931863906127327e-07, + "loss": 0.0032, + "step": 19600 + }, + { + "epoch": 10.973698936765528, + "grad_norm": 0.18532228469848633, + "learning_rate": 1.0392290183909304e-07, + "loss": 0.0053, + "step": 19610 + }, + { + "epoch": 10.97929490766648, + "grad_norm": 0.24849370121955872, + "learning_rate": 9.866357858642205e-08, + "loss": 0.0031, + "step": 19620 + }, + { + "epoch": 10.984890878567432, + "grad_norm": 0.04739070311188698, + "learning_rate": 9.354068368204739e-08, + "loss": 0.0055, + "step": 19630 + }, + { + "epoch": 10.990486849468383, + "grad_norm": 0.13325341045856476, + "learning_rate": 8.855423113177664e-08, + "loss": 0.0027, + "step": 19640 + }, + { + "epoch": 10.996082820369335, + "grad_norm": 0.15442515909671783, + "learning_rate": 8.37042345683714e-08, + "loss": 0.009, + "step": 19650 + }, + { + "epoch": 11.001678791270285, + "grad_norm": 0.20657239854335785, + "learning_rate": 7.899070725153613e-08, + "loss": 0.0063, + "step": 19660 + }, + { + "epoch": 11.007274762171237, + "grad_norm": 0.16029535233974457, + "learning_rate": 7.44136620678848e-08, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 11.012870733072187, + "grad_norm": 0.16476546227931976, + "learning_rate": 6.997311153086883e-08, + "loss": 0.0066, + "step": 19680 + }, + { + "epoch": 11.01846670397314, + "grad_norm": 0.12683425843715668, + "learning_rate": 6.566906778079917e-08, + "loss": 0.0052, + "step": 19690 + }, + { + "epoch": 11.024062674874092, + "grad_norm": 0.23135153949260712, + "learning_rate": 6.150154258476315e-08, + "loss": 0.0043, + "step": 19700 + }, + { + "epoch": 11.029658645775042, + "grad_norm": 0.1939716786146164, + "learning_rate": 5.747054733660773e-08, + "loss": 0.0077, + "step": 19710 + }, + { + "epoch": 11.035254616675994, + "grad_norm": 0.11450741440057755, + "learning_rate": 5.3576093056922906e-08, + "loss": 0.0079, + "step": 19720 + }, + { + "epoch": 11.040850587576944, + "grad_norm": 0.06929726153612137, + "learning_rate": 4.981819039300284e-08, + "loss": 0.0039, + "step": 19730 + }, + { + "epoch": 11.046446558477896, + "grad_norm": 0.11268885433673859, + "learning_rate": 4.619684961881254e-08, + "loss": 0.0047, + "step": 19740 + }, + { + "epoch": 11.052042529378847, + "grad_norm": 0.07555661350488663, + "learning_rate": 4.2712080634949024e-08, + "loss": 0.0038, + "step": 19750 + }, + { + "epoch": 11.057638500279799, + "grad_norm": 0.07180225849151611, + "learning_rate": 3.936389296864129e-08, + "loss": 0.0066, + "step": 19760 + }, + { + "epoch": 11.063234471180749, + "grad_norm": 0.2635197937488556, + "learning_rate": 3.615229577371149e-08, + "loss": 0.0047, + "step": 19770 + }, + { + "epoch": 11.068830442081701, + "grad_norm": 0.03527739644050598, + "learning_rate": 3.3077297830541584e-08, + "loss": 0.0047, + "step": 19780 + }, + { + "epoch": 11.074426412982653, + "grad_norm": 0.061606280505657196, + "learning_rate": 3.01389075460512e-08, + "loss": 0.0069, + "step": 19790 + }, + { + "epoch": 11.080022383883604, + "grad_norm": 0.14764872193336487, + "learning_rate": 2.7337132953697554e-08, + "loss": 0.0063, + "step": 19800 + }, + { + "epoch": 11.085618354784556, + "grad_norm": 0.13825170695781708, + "learning_rate": 2.467198171342e-08, + "loss": 0.0047, + "step": 19810 + }, + { + "epoch": 11.091214325685506, + "grad_norm": 0.40132373571395874, + "learning_rate": 2.214346111164556e-08, + "loss": 0.0058, + "step": 19820 + }, + { + "epoch": 11.096810296586458, + "grad_norm": 0.06293044239282608, + "learning_rate": 1.9751578061244504e-08, + "loss": 0.0093, + "step": 19830 + }, + { + "epoch": 11.102406267487408, + "grad_norm": 0.08641501516103745, + "learning_rate": 1.749633910153592e-08, + "loss": 0.0061, + "step": 19840 + }, + { + "epoch": 11.10800223838836, + "grad_norm": 0.06543342024087906, + "learning_rate": 1.5377750398265502e-08, + "loss": 0.0034, + "step": 19850 + }, + { + "epoch": 11.11359820928931, + "grad_norm": 0.0463268905878067, + "learning_rate": 1.3395817743561134e-08, + "loss": 0.0031, + "step": 19860 + }, + { + "epoch": 11.119194180190263, + "grad_norm": 0.18889687955379486, + "learning_rate": 1.1550546555960662e-08, + "loss": 0.0049, + "step": 19870 + }, + { + "epoch": 11.124790151091215, + "grad_norm": 0.33526870608329773, + "learning_rate": 9.841941880361916e-09, + "loss": 0.0068, + "step": 19880 + }, + { + "epoch": 11.130386121992165, + "grad_norm": 0.17259934544563293, + "learning_rate": 8.270008388022721e-09, + "loss": 0.0047, + "step": 19890 + }, + { + "epoch": 11.135982092893117, + "grad_norm": 0.24882031977176666, + "learning_rate": 6.834750376549792e-09, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 11.141578063794068, + "grad_norm": 0.05286456272006035, + "learning_rate": 5.536171769887632e-09, + "loss": 0.0059, + "step": 19910 + }, + { + "epoch": 11.14717403469502, + "grad_norm": 0.08882560580968857, + "learning_rate": 4.3742761183018784e-09, + "loss": 0.0063, + "step": 19920 + }, + { + "epoch": 11.15277000559597, + "grad_norm": 0.09571769833564758, + "learning_rate": 3.349066598362649e-09, + "loss": 0.0034, + "step": 19930 + }, + { + "epoch": 11.158365976496922, + "grad_norm": 0.07795775681734085, + "learning_rate": 2.4605460129556445e-09, + "loss": 0.0029, + "step": 19940 + }, + { + "epoch": 11.163961947397874, + "grad_norm": 0.07696644216775894, + "learning_rate": 1.7087167912710478e-09, + "loss": 0.0083, + "step": 19950 + }, + { + "epoch": 11.169557918298825, + "grad_norm": 0.26498469710350037, + "learning_rate": 1.0935809887702154e-09, + "loss": 0.0033, + "step": 19960 + }, + { + "epoch": 11.175153889199777, + "grad_norm": 0.165630042552948, + "learning_rate": 6.151402872134337e-10, + "loss": 0.007, + "step": 19970 + }, + { + "epoch": 11.180749860100727, + "grad_norm": 0.07009857147932053, + "learning_rate": 2.7339599464326627e-10, + "loss": 0.0037, + "step": 19980 + }, + { + "epoch": 11.18634583100168, + "grad_norm": 0.1754114180803299, + "learning_rate": 6.834904537900144e-11, + "loss": 0.0069, + "step": 19990 + }, + { + "epoch": 11.19194180190263, + "grad_norm": 0.18103741109371185, + "learning_rate": 0.0, + "loss": 0.0044, + "step": 20000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.091345386565736e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b487e2f6b318ff3c24e24d50d3fe45d9b37f56df --- /dev/null +++ b/config.json @@ -0,0 +1,65 @@ +{ + "_name_or_path": "/root/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "action_dim": 32, + "action_head_cfg": { + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "diffusion_model_cfg": { + "attention_head_dim": 48, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 32, + "num_layers": 16, + "output_dim": 1024, + "positional_embeddings": null + }, + "freeze_decode_layer": false, + "hidden_size": 1024, + "input_embedding_dim": 1536, + "load_pretrained_det_decode_layer_path": null, + "max_action_dim": 32, + "max_state_dim": 64, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "tune_diffusion_model": true, + "tune_projector": true + }, + "action_horizon": 16, + "architectures": [ + "GR00T_N1" + ], + "attn_implementation": null, + "backbone_cfg": { + "allow_reshape_visual": true, + "load_pretrained_det_eagle_path": null, + "model_name": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "processor_cfg": { + "max_input_tiles": 1, + "model_path": "$GR00T_BACKBONE_PATH/eagle2_hg_model", + "model_spec": { + "num_image_token": 64, + "template": "qwen2-chat" + } + }, + "projector_dim": 2048, + "remove_llm": false, + "reproject_vision": false, + "scale_image_resolution": 1, + "select_layer": 12, + "tune_llm": false, + "tune_visual": true + }, + "compute_dtype": "bfloat16", + "hidden_size": 1536, + "model_dtype": "float32", + "model_type": "gr00t_n1", + "torch_dtype": "float32", + "transformers_version": "4.45.2" +} diff --git a/experiment_cfg/metadata.json b/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b8c52c4f23fed6f98186e1b3d4b2c344af17aa --- /dev/null +++ b/experiment_cfg/metadata.json @@ -0,0 +1,195 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 50.09765625, + 86.8359375, + 90.439453125, + 80.15625, + 21.005859375 + ], + "min": [ + -27.7734375, + -98.349609375, + -75.5859375, + -34.013671875, + -9.4921875 + ], + "mean": [ + 1.2688713073730469, + -29.948328018188477, + 47.12052536010742, + 34.748233795166016, + 7.337850570678711 + ], + "std": [ + 20.025409698486328, + 44.50356674194336, + 31.859237670898438, + 25.341203689575195, + 5.396989822387695 + ], + "q01": [ + -27.158203125, + -98.26171875, + -37.6171875, + -16.69921875, + -4.306640625 + ], + "q99": [ + 45.703125, + 59.501953125, + 90.439453125, + 77.783203125, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.12890625 + ], + "min": [ + -0.17578125 + ], + "mean": [ + 23.515682220458984 + ], + "std": [ + 15.777379989624023 + ], + "q01": [ + -0.17578125 + ], + "q99": [ + 54.4921875 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 50.888671875, + 83.3203125, + 92.197265625, + 80.33203125, + 21.4453125 + ], + "min": [ + -28.037109375, + -98.876953125, + -84.55078125, + -36.2109375, + -9.140625 + ], + "mean": [ + 1.3174431324005127, + -32.38520431518555, + 43.428009033203125, + 33.60067367553711, + 7.2843017578125 + ], + "std": [ + 19.972745895385742, + 43.20457458496094, + 33.67258834838867, + 25.936344146728516, + 5.434234619140625 + ], + "q01": [ + -27.333984375, + -98.876953125, + -44.736328125, + -18.896484375, + -4.482421875 + ], + "q99": [ + 46.23046875, + 55.458984375, + 92.197265625, + 77.958984375, + 18.544921875 + ] + }, + "gripper": { + "max": [ + 57.392578125 + ], + "min": [ + -0.52734375 + ], + "mean": [ + 21.392284393310547 + ], + "std": [ + 16.467666625976562 + ], + "q01": [ + -0.52734375 + ], + "q99": [ + 54.580078125 + ] + } + } + }, + "modalities": { + "video": { + "cam1": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "cam2": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1af3c78faba3d8e1dafcb8fae5aca98c3d8a2ff8 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868027eb73128c8a91a9183c79c269dded0b87001a41f4836fb8fb945f17f3ee +size 4938446392 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6eb984a90e36390d9452da71b260ffe74d62da9f --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdcd5c4024fdfc3f6132554a06dc56cf2529b6e88cf5cd377def158edaf9b1f8 +size 3821736024 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48f18312a6ba056438ca45f31b81890f021232 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,809 @@ +{ + "metadata": { + "total_size": 8760067008 + }, + "weight_map": { + "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.action_decoder.layer2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W1.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors", + "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.proj_out_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00002-of-00002.safetensors", + "action_head.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.0.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.1.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.10.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.11.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.12.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.13.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.14.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.15.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.2.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.3.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.4.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.5.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.6.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.7.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.8.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_k.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_q.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.attn1.to_v.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.0.proj.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.ff.net.2.weight": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.bias": "model-00002-of-00002.safetensors", + "action_head.model.transformer_blocks.9.norm1.linear.weight": "model-00002-of-00002.safetensors", + "action_head.position_embedding.weight": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer1.b": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.W": "model-00002-of-00002.safetensors", + "action_head.state_encoder.layer2.b": "model-00002-of-00002.safetensors", + "backbone.linear.bias": "model-00002-of-00002.safetensors", + "backbone.linear.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.0.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.1.weight": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.bias": "model-00002-of-00002.safetensors", + "backbone.model.mlp1.3.weight": "model-00002-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "backbone.model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors" + } +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..472bb1ee66b5fcec08ef0fa20563beb4df224f08 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,14042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.19194180190263, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005595970900951315, + "grad_norm": 7.419506072998047, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9689, + "step": 10 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 8.035171508789062, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8977, + "step": 20 + }, + { + "epoch": 0.016787912702853944, + "grad_norm": 7.580524444580078, + "learning_rate": 3e-06, + "loss": 0.9942, + "step": 30 + }, + { + "epoch": 0.02238388360380526, + "grad_norm": 5.7520976066589355, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8421, + "step": 40 + }, + { + "epoch": 0.027979854504756575, + "grad_norm": 4.714428901672363, + "learning_rate": 5e-06, + "loss": 0.6063, + "step": 50 + }, + { + "epoch": 0.03357582540570789, + "grad_norm": 4.136861801147461, + "learning_rate": 6e-06, + "loss": 0.4259, + "step": 60 + }, + { + "epoch": 0.03917179630665921, + "grad_norm": 2.1667540073394775, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3447, + "step": 70 + }, + { + "epoch": 0.04476776720761052, + "grad_norm": 2.3095765113830566, + "learning_rate": 8.000000000000001e-06, + "loss": 0.284, + "step": 80 + }, + { + "epoch": 0.05036373810856184, + "grad_norm": 1.2860591411590576, + "learning_rate": 9e-06, + "loss": 0.2067, + "step": 90 + }, + { + "epoch": 0.05595970900951315, + "grad_norm": 2.0302886962890625, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 100 + }, + { + "epoch": 0.06155567991046446, + "grad_norm": 1.2757196426391602, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1442, + "step": 110 + }, + { + "epoch": 0.06715165081141578, + "grad_norm": 1.5842756032943726, + "learning_rate": 1.2e-05, + "loss": 0.132, + "step": 120 + }, + { + "epoch": 0.0727476217123671, + "grad_norm": 1.0327903032302856, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.097, + "step": 130 + }, + { + "epoch": 0.07834359261331841, + "grad_norm": 0.733019232749939, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0807, + "step": 140 + }, + { + "epoch": 0.08393956351426973, + "grad_norm": 0.9548436999320984, + "learning_rate": 1.5e-05, + "loss": 0.0922, + "step": 150 + }, + { + "epoch": 0.08953553441522104, + "grad_norm": 0.44906941056251526, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0841, + "step": 160 + }, + { + "epoch": 0.09513150531617236, + "grad_norm": 0.9586009979248047, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0726, + "step": 170 + }, + { + "epoch": 0.10072747621712368, + "grad_norm": 0.6236313581466675, + "learning_rate": 1.8e-05, + "loss": 0.0631, + "step": 180 + }, + { + "epoch": 0.10632344711807498, + "grad_norm": 1.1688262224197388, + "learning_rate": 1.9e-05, + "loss": 0.0717, + "step": 190 + }, + { + "epoch": 0.1119194180190263, + "grad_norm": 1.5576119422912598, + "learning_rate": 2e-05, + "loss": 0.0718, + "step": 200 + }, + { + "epoch": 0.11751538891997762, + "grad_norm": 1.0707802772521973, + "learning_rate": 2.1e-05, + "loss": 0.0591, + "step": 210 + }, + { + "epoch": 0.12311135982092893, + "grad_norm": 0.8612272143363953, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0623, + "step": 220 + }, + { + "epoch": 0.12870733072188026, + "grad_norm": 0.796205997467041, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0563, + "step": 230 + }, + { + "epoch": 0.13430330162283155, + "grad_norm": 1.127061367034912, + "learning_rate": 2.4e-05, + "loss": 0.0545, + "step": 240 + }, + { + "epoch": 0.13989927252378287, + "grad_norm": 0.9559623003005981, + "learning_rate": 2.5e-05, + "loss": 0.0543, + "step": 250 + }, + { + "epoch": 0.1454952434247342, + "grad_norm": 0.7295358777046204, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0554, + "step": 260 + }, + { + "epoch": 0.1510912143256855, + "grad_norm": 0.8386074900627136, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0488, + "step": 270 + }, + { + "epoch": 0.15668718522663683, + "grad_norm": 0.9443495869636536, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0639, + "step": 280 + }, + { + "epoch": 0.16228315612758815, + "grad_norm": 0.8754186630249023, + "learning_rate": 2.9e-05, + "loss": 0.0477, + "step": 290 + }, + { + "epoch": 0.16787912702853947, + "grad_norm": 0.5491052269935608, + "learning_rate": 3e-05, + "loss": 0.0509, + "step": 300 + }, + { + "epoch": 0.17347509792949076, + "grad_norm": 0.7870469093322754, + "learning_rate": 3.1e-05, + "loss": 0.0478, + "step": 310 + }, + { + "epoch": 0.17907106883044208, + "grad_norm": 0.9322296380996704, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0514, + "step": 320 + }, + { + "epoch": 0.1846670397313934, + "grad_norm": 1.236414909362793, + "learning_rate": 3.3e-05, + "loss": 0.0504, + "step": 330 + }, + { + "epoch": 0.19026301063234471, + "grad_norm": 1.2571903467178345, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0374, + "step": 340 + }, + { + "epoch": 0.19585898153329603, + "grad_norm": 1.1705288887023926, + "learning_rate": 3.5e-05, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.20145495243424735, + "grad_norm": 1.0005333423614502, + "learning_rate": 3.6e-05, + "loss": 0.0459, + "step": 360 + }, + { + "epoch": 0.20705092333519864, + "grad_norm": 0.5335679054260254, + "learning_rate": 3.7e-05, + "loss": 0.0444, + "step": 370 + }, + { + "epoch": 0.21264689423614996, + "grad_norm": 1.052669882774353, + "learning_rate": 3.8e-05, + "loss": 0.0409, + "step": 380 + }, + { + "epoch": 0.21824286513710128, + "grad_norm": 0.44473376870155334, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0505, + "step": 390 + }, + { + "epoch": 0.2238388360380526, + "grad_norm": 0.6711838841438293, + "learning_rate": 4e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.22943480693900392, + "grad_norm": 0.55412358045578, + "learning_rate": 4.1e-05, + "loss": 0.0416, + "step": 410 + }, + { + "epoch": 0.23503077783995524, + "grad_norm": 1.0375343561172485, + "learning_rate": 4.2e-05, + "loss": 0.0501, + "step": 420 + }, + { + "epoch": 0.24062674874090656, + "grad_norm": 0.7955525517463684, + "learning_rate": 4.3e-05, + "loss": 0.0461, + "step": 430 + }, + { + "epoch": 0.24622271964185785, + "grad_norm": 0.8107234239578247, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0448, + "step": 440 + }, + { + "epoch": 0.2518186905428092, + "grad_norm": 0.8368202447891235, + "learning_rate": 4.5e-05, + "loss": 0.0459, + "step": 450 + }, + { + "epoch": 0.2574146614437605, + "grad_norm": 0.6938339471817017, + "learning_rate": 4.600000000000001e-05, + "loss": 0.034, + "step": 460 + }, + { + "epoch": 0.2630106323447118, + "grad_norm": 0.8612020611763, + "learning_rate": 4.7e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.2686066032456631, + "grad_norm": 0.777197539806366, + "learning_rate": 4.8e-05, + "loss": 0.0381, + "step": 480 + }, + { + "epoch": 0.2742025741466144, + "grad_norm": 0.6520339250564575, + "learning_rate": 4.9e-05, + "loss": 0.0381, + "step": 490 + }, + { + "epoch": 0.27979854504756574, + "grad_norm": 0.5808746814727783, + "learning_rate": 5e-05, + "loss": 0.0285, + "step": 500 + }, + { + "epoch": 0.28539451594851706, + "grad_norm": 0.9482337832450867, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0362, + "step": 510 + }, + { + "epoch": 0.2909904868494684, + "grad_norm": 0.5615134239196777, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0322, + "step": 520 + }, + { + "epoch": 0.2965864577504197, + "grad_norm": 1.2695409059524536, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0411, + "step": 530 + }, + { + "epoch": 0.302182428651371, + "grad_norm": 0.7221632599830627, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0422, + "step": 540 + }, + { + "epoch": 0.30777839955232233, + "grad_norm": 1.1144938468933105, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0334, + "step": 550 + }, + { + "epoch": 0.31337437045327365, + "grad_norm": 0.6722885966300964, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0436, + "step": 560 + }, + { + "epoch": 0.318970341354225, + "grad_norm": 1.0043433904647827, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0452, + "step": 570 + }, + { + "epoch": 0.3245663122551763, + "grad_norm": 0.9483539462089539, + "learning_rate": 5.8e-05, + "loss": 0.0492, + "step": 580 + }, + { + "epoch": 0.3301622831561276, + "grad_norm": 0.7825531363487244, + "learning_rate": 5.9e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.33575825405707893, + "grad_norm": 0.7982919216156006, + "learning_rate": 6e-05, + "loss": 0.0447, + "step": 600 + }, + { + "epoch": 0.3413542249580302, + "grad_norm": 0.9162524342536926, + "learning_rate": 6.1e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.3469501958589815, + "grad_norm": 0.5597997903823853, + "learning_rate": 6.2e-05, + "loss": 0.0393, + "step": 620 + }, + { + "epoch": 0.35254616675993283, + "grad_norm": 0.713256299495697, + "learning_rate": 6.3e-05, + "loss": 0.0394, + "step": 630 + }, + { + "epoch": 0.35814213766088415, + "grad_norm": 0.7356066703796387, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0339, + "step": 640 + }, + { + "epoch": 0.36373810856183547, + "grad_norm": 0.5933259129524231, + "learning_rate": 6.500000000000001e-05, + "loss": 0.038, + "step": 650 + }, + { + "epoch": 0.3693340794627868, + "grad_norm": 0.5277016162872314, + "learning_rate": 6.6e-05, + "loss": 0.0383, + "step": 660 + }, + { + "epoch": 0.3749300503637381, + "grad_norm": 0.9106026887893677, + "learning_rate": 6.7e-05, + "loss": 0.0268, + "step": 670 + }, + { + "epoch": 0.38052602126468943, + "grad_norm": 0.5941755771636963, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0399, + "step": 680 + }, + { + "epoch": 0.38612199216564075, + "grad_norm": 0.7207239270210266, + "learning_rate": 6.9e-05, + "loss": 0.0304, + "step": 690 + }, + { + "epoch": 0.39171796306659207, + "grad_norm": 0.5808258652687073, + "learning_rate": 7e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.3973139339675434, + "grad_norm": 0.6304859519004822, + "learning_rate": 7.1e-05, + "loss": 0.0417, + "step": 710 + }, + { + "epoch": 0.4029099048684947, + "grad_norm": 0.6625694036483765, + "learning_rate": 7.2e-05, + "loss": 0.0301, + "step": 720 + }, + { + "epoch": 0.408505875769446, + "grad_norm": 0.6456591486930847, + "learning_rate": 7.3e-05, + "loss": 0.0416, + "step": 730 + }, + { + "epoch": 0.4141018466703973, + "grad_norm": 0.8103715181350708, + "learning_rate": 7.4e-05, + "loss": 0.0398, + "step": 740 + }, + { + "epoch": 0.4196978175713486, + "grad_norm": 0.592147707939148, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0317, + "step": 750 + }, + { + "epoch": 0.4252937884722999, + "grad_norm": 0.6823825836181641, + "learning_rate": 7.6e-05, + "loss": 0.031, + "step": 760 + }, + { + "epoch": 0.43088975937325125, + "grad_norm": 0.3274383544921875, + "learning_rate": 7.7e-05, + "loss": 0.0305, + "step": 770 + }, + { + "epoch": 0.43648573027420257, + "grad_norm": 0.3436225950717926, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.4420817011751539, + "grad_norm": 0.8361327052116394, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0264, + "step": 790 + }, + { + "epoch": 0.4476776720761052, + "grad_norm": 0.5449605584144592, + "learning_rate": 8e-05, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 0.4532736429770565, + "grad_norm": 0.31227922439575195, + "learning_rate": 8.1e-05, + "loss": 0.0272, + "step": 810 + }, + { + "epoch": 0.45886961387800784, + "grad_norm": 0.6099038124084473, + "learning_rate": 8.2e-05, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.46446558477895916, + "grad_norm": 0.6343345642089844, + "learning_rate": 8.3e-05, + "loss": 0.0343, + "step": 830 + }, + { + "epoch": 0.4700615556799105, + "grad_norm": 0.7962288856506348, + "learning_rate": 8.4e-05, + "loss": 0.0292, + "step": 840 + }, + { + "epoch": 0.4756575265808618, + "grad_norm": 0.3960738182067871, + "learning_rate": 8.5e-05, + "loss": 0.033, + "step": 850 + }, + { + "epoch": 0.4812534974818131, + "grad_norm": 0.9380257725715637, + "learning_rate": 8.6e-05, + "loss": 0.0404, + "step": 860 + }, + { + "epoch": 0.4868494683827644, + "grad_norm": 0.7713156342506409, + "learning_rate": 8.7e-05, + "loss": 0.0387, + "step": 870 + }, + { + "epoch": 0.4924454392837157, + "grad_norm": 1.137207269668579, + "learning_rate": 8.800000000000001e-05, + "loss": 0.039, + "step": 880 + }, + { + "epoch": 0.498041410184667, + "grad_norm": 0.7128203511238098, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0354, + "step": 890 + }, + { + "epoch": 0.5036373810856184, + "grad_norm": 0.6396750211715698, + "learning_rate": 9e-05, + "loss": 0.0367, + "step": 900 + }, + { + "epoch": 0.5092333519865697, + "grad_norm": 0.6838144659996033, + "learning_rate": 9.1e-05, + "loss": 0.0369, + "step": 910 + }, + { + "epoch": 0.514829322887521, + "grad_norm": 0.6156594157218933, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 0.5204252937884724, + "grad_norm": 0.5517926812171936, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0497, + "step": 930 + }, + { + "epoch": 0.5260212646894236, + "grad_norm": 0.6177653670310974, + "learning_rate": 9.4e-05, + "loss": 0.0322, + "step": 940 + }, + { + "epoch": 0.5316172355903749, + "grad_norm": 0.5705161094665527, + "learning_rate": 9.5e-05, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.5372132064913262, + "grad_norm": 0.7966452836990356, + "learning_rate": 9.6e-05, + "loss": 0.0377, + "step": 960 + }, + { + "epoch": 0.5428091773922775, + "grad_norm": 0.7984173893928528, + "learning_rate": 9.7e-05, + "loss": 0.0335, + "step": 970 + }, + { + "epoch": 0.5484051482932288, + "grad_norm": 0.6380477547645569, + "learning_rate": 9.8e-05, + "loss": 0.0329, + "step": 980 + }, + { + "epoch": 0.5540011191941802, + "grad_norm": 0.7180393934249878, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0302, + "step": 990 + }, + { + "epoch": 0.5595970900951315, + "grad_norm": 0.8885056972503662, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1000 + }, + { + "epoch": 0.5651930609960828, + "grad_norm": 0.41542354226112366, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0445, + "step": 1010 + }, + { + "epoch": 0.5707890318970341, + "grad_norm": 0.4343472421169281, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0263, + "step": 1020 + }, + { + "epoch": 0.5763850027979854, + "grad_norm": 0.7970145344734192, + "learning_rate": 9.999938485971279e-05, + "loss": 0.0322, + "step": 1030 + }, + { + "epoch": 0.5819809736989368, + "grad_norm": 0.6129629015922546, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0262, + "step": 1040 + }, + { + "epoch": 0.5875769445998881, + "grad_norm": 0.5661425590515137, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0317, + "step": 1050 + }, + { + "epoch": 0.5931729155008394, + "grad_norm": 0.7532817721366882, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0359, + "step": 1060 + }, + { + "epoch": 0.5987688864017907, + "grad_norm": 0.42677804827690125, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0273, + "step": 1070 + }, + { + "epoch": 0.604364857302742, + "grad_norm": 0.6325145363807678, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0377, + "step": 1080 + }, + { + "epoch": 0.6099608282036934, + "grad_norm": 0.6003039479255676, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.6155567991046447, + "grad_norm": 0.36753129959106445, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 0.621152770005596, + "grad_norm": 0.43158769607543945, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0275, + "step": 1110 + }, + { + "epoch": 0.6267487409065473, + "grad_norm": 0.33566170930862427, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0278, + "step": 1120 + }, + { + "epoch": 0.6323447118074986, + "grad_norm": 0.671672523021698, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0344, + "step": 1130 + }, + { + "epoch": 0.63794068270845, + "grad_norm": 1.1190325021743774, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0304, + "step": 1140 + }, + { + "epoch": 0.6435366536094013, + "grad_norm": 0.6546229124069214, + "learning_rate": 9.998462224960175e-05, + "loss": 0.0343, + "step": 1150 + }, + { + "epoch": 0.6491326245103526, + "grad_norm": 0.7560105323791504, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0259, + "step": 1160 + }, + { + "epoch": 0.6547285954113039, + "grad_norm": 0.6937676072120667, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0308, + "step": 1170 + }, + { + "epoch": 0.6603245663122552, + "grad_norm": 0.4479691684246063, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0272, + "step": 1180 + }, + { + "epoch": 0.6659205372132065, + "grad_norm": 0.38218632340431213, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0313, + "step": 1190 + }, + { + "epoch": 0.6715165081141579, + "grad_norm": 0.3345787525177002, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0328, + "step": 1200 + }, + { + "epoch": 0.6771124790151091, + "grad_norm": 0.3578011989593506, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.6827084499160604, + "grad_norm": 0.6602341532707214, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0346, + "step": 1220 + }, + { + "epoch": 0.6883044208170117, + "grad_norm": 0.4503819942474365, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0243, + "step": 1230 + }, + { + "epoch": 0.693900391717963, + "grad_norm": 0.753041684627533, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0277, + "step": 1240 + }, + { + "epoch": 0.6994963626189143, + "grad_norm": 0.3396258056163788, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0219, + "step": 1250 + }, + { + "epoch": 0.7050923335198657, + "grad_norm": 0.6529501676559448, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0242, + "step": 1260 + }, + { + "epoch": 0.710688304420817, + "grad_norm": 0.2462773472070694, + "learning_rate": 9.9950181809607e-05, + "loss": 0.021, + "step": 1270 + }, + { + "epoch": 0.7162842753217683, + "grad_norm": 0.4511205554008484, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0267, + "step": 1280 + }, + { + "epoch": 0.7218802462227196, + "grad_norm": 0.5708833336830139, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0288, + "step": 1290 + }, + { + "epoch": 0.7274762171236709, + "grad_norm": 0.4378319978713989, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0308, + "step": 1300 + }, + { + "epoch": 0.7330721880246223, + "grad_norm": 0.44127964973449707, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.7386681589255736, + "grad_norm": 0.35624831914901733, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0298, + "step": 1320 + }, + { + "epoch": 0.7442641298265249, + "grad_norm": 0.45579585433006287, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0325, + "step": 1330 + }, + { + "epoch": 0.7498601007274762, + "grad_norm": 0.6297839283943176, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0369, + "step": 1340 + }, + { + "epoch": 0.7554560716284275, + "grad_norm": 0.29105043411254883, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0253, + "step": 1350 + }, + { + "epoch": 0.7610520425293789, + "grad_norm": 0.501181960105896, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0355, + "step": 1360 + }, + { + "epoch": 0.7666480134303302, + "grad_norm": 0.4630679488182068, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0264, + "step": 1370 + }, + { + "epoch": 0.7722439843312815, + "grad_norm": 0.6088075637817383, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0282, + "step": 1380 + }, + { + "epoch": 0.7778399552322328, + "grad_norm": 0.5682616233825684, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0331, + "step": 1390 + }, + { + "epoch": 0.7834359261331841, + "grad_norm": 0.4457339644432068, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0309, + "step": 1400 + }, + { + "epoch": 0.7890318970341355, + "grad_norm": 0.566882848739624, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0436, + "step": 1410 + }, + { + "epoch": 0.7946278679350868, + "grad_norm": 0.4208590090274811, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0293, + "step": 1420 + }, + { + "epoch": 0.8002238388360381, + "grad_norm": 0.5373462438583374, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0333, + "step": 1430 + }, + { + "epoch": 0.8058198097369894, + "grad_norm": 0.4833603799343109, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0238, + "step": 1440 + }, + { + "epoch": 0.8114157806379407, + "grad_norm": 0.3185485303401947, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0279, + "step": 1450 + }, + { + "epoch": 0.817011751538892, + "grad_norm": 0.32943880558013916, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0259, + "step": 1460 + }, + { + "epoch": 0.8226077224398433, + "grad_norm": 0.4028552174568176, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0183, + "step": 1470 + }, + { + "epoch": 0.8282036933407946, + "grad_norm": 0.3354315459728241, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0279, + "step": 1480 + }, + { + "epoch": 0.8337996642417459, + "grad_norm": 0.581444263458252, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0231, + "step": 1490 + }, + { + "epoch": 0.8393956351426972, + "grad_norm": 0.3263351321220398, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0257, + "step": 1500 + }, + { + "epoch": 0.8449916060436485, + "grad_norm": 0.4574286639690399, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0172, + "step": 1510 + }, + { + "epoch": 0.8505875769445999, + "grad_norm": 0.6482700705528259, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0252, + "step": 1520 + }, + { + "epoch": 0.8561835478455512, + "grad_norm": 0.22327029705047607, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0296, + "step": 1530 + }, + { + "epoch": 0.8617795187465025, + "grad_norm": 0.39261817932128906, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0226, + "step": 1540 + }, + { + "epoch": 0.8673754896474538, + "grad_norm": 0.3742023706436157, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0283, + "step": 1550 + }, + { + "epoch": 0.8729714605484051, + "grad_norm": 0.240834578871727, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0176, + "step": 1560 + }, + { + "epoch": 0.8785674314493565, + "grad_norm": 0.39040738344192505, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0225, + "step": 1570 + }, + { + "epoch": 0.8841634023503078, + "grad_norm": 0.3102349042892456, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0229, + "step": 1580 + }, + { + "epoch": 0.8897593732512591, + "grad_norm": 0.32893484830856323, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0286, + "step": 1590 + }, + { + "epoch": 0.8953553441522104, + "grad_norm": 0.3821198046207428, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0278, + "step": 1600 + }, + { + "epoch": 0.9009513150531617, + "grad_norm": 0.3672045171260834, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0275, + "step": 1610 + }, + { + "epoch": 0.906547285954113, + "grad_norm": 0.36223965883255005, + "learning_rate": 9.973749622593534e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 0.9121432568550644, + "grad_norm": 0.5474312901496887, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0307, + "step": 1630 + }, + { + "epoch": 0.9177392277560157, + "grad_norm": 0.7324241399765015, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0246, + "step": 1640 + }, + { + "epoch": 0.923335198656967, + "grad_norm": 0.44370922446250916, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0229, + "step": 1650 + }, + { + "epoch": 0.9289311695579183, + "grad_norm": 0.40400007367134094, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0368, + "step": 1660 + }, + { + "epoch": 0.9345271404588696, + "grad_norm": 0.4597970247268677, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0215, + "step": 1670 + }, + { + "epoch": 0.940123111359821, + "grad_norm": 0.41508862376213074, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0251, + "step": 1680 + }, + { + "epoch": 0.9457190822607723, + "grad_norm": 0.5726234316825867, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0385, + "step": 1690 + }, + { + "epoch": 0.9513150531617236, + "grad_norm": 0.47390761971473694, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0269, + "step": 1700 + }, + { + "epoch": 0.9569110240626749, + "grad_norm": 0.3252114951610565, + "learning_rate": 9.965584791221048e-05, + "loss": 0.023, + "step": 1710 + }, + { + "epoch": 0.9625069949636262, + "grad_norm": 0.4773138761520386, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0322, + "step": 1720 + }, + { + "epoch": 0.9681029658645776, + "grad_norm": 0.45844170451164246, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 0.9736989367655288, + "grad_norm": 0.40978696942329407, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0337, + "step": 1740 + }, + { + "epoch": 0.9792949076664801, + "grad_norm": 0.43942537903785706, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0225, + "step": 1750 + }, + { + "epoch": 0.9848908785674314, + "grad_norm": 0.7744397521018982, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 0.9904868494683827, + "grad_norm": 0.3644595444202423, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0252, + "step": 1770 + }, + { + "epoch": 0.996082820369334, + "grad_norm": 0.29574769735336304, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 1.0016787912702854, + "grad_norm": 0.5153500437736511, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 1.0072747621712368, + "grad_norm": 0.5961137413978577, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 1.012870733072188, + "grad_norm": 0.48836737871170044, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0213, + "step": 1810 + }, + { + "epoch": 1.0184667039731394, + "grad_norm": 0.5610430240631104, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0205, + "step": 1820 + }, + { + "epoch": 1.0240626748740906, + "grad_norm": 0.4025803804397583, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0224, + "step": 1830 + }, + { + "epoch": 1.029658645775042, + "grad_norm": 0.605367124080658, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0303, + "step": 1840 + }, + { + "epoch": 1.0352546166759933, + "grad_norm": 0.3206970989704132, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0231, + "step": 1850 + }, + { + "epoch": 1.0408505875769447, + "grad_norm": 0.3495715260505676, + "learning_rate": 9.949534157133844e-05, + "loss": 0.024, + "step": 1860 + }, + { + "epoch": 1.046446558477896, + "grad_norm": 0.3895197808742523, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0203, + "step": 1870 + }, + { + "epoch": 1.0520425293788471, + "grad_norm": 0.40038052201271057, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0221, + "step": 1880 + }, + { + "epoch": 1.0576385002797986, + "grad_norm": 0.479744553565979, + "learning_rate": 9.945958340417283e-05, + "loss": 0.028, + "step": 1890 + }, + { + "epoch": 1.0632344711807498, + "grad_norm": 0.3020111322402954, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0265, + "step": 1900 + }, + { + "epoch": 1.0688304420817012, + "grad_norm": 0.3391585648059845, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0233, + "step": 1910 + }, + { + "epoch": 1.0744264129826524, + "grad_norm": 0.3941816985607147, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0184, + "step": 1920 + }, + { + "epoch": 1.0800223838836038, + "grad_norm": 0.31161707639694214, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0229, + "step": 1930 + }, + { + "epoch": 1.085618354784555, + "grad_norm": 0.33263275027275085, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0227, + "step": 1940 + }, + { + "epoch": 1.0912143256855065, + "grad_norm": 0.35178300738334656, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0224, + "step": 1950 + }, + { + "epoch": 1.0968102965864577, + "grad_norm": 0.374667227268219, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0196, + "step": 1960 + }, + { + "epoch": 1.102406267487409, + "grad_norm": 0.2080841362476349, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0274, + "step": 1970 + }, + { + "epoch": 1.1080022383883603, + "grad_norm": 0.29197070002555847, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0242, + "step": 1980 + }, + { + "epoch": 1.1135982092893117, + "grad_norm": 0.32980409264564514, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0189, + "step": 1990 + }, + { + "epoch": 1.119194180190263, + "grad_norm": 0.4776092767715454, + "learning_rate": 9.931806517013612e-05, + "loss": 0.022, + "step": 2000 + }, + { + "epoch": 1.1247901510912144, + "grad_norm": 0.37389442324638367, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0216, + "step": 2010 + }, + { + "epoch": 1.1303861219921656, + "grad_norm": 0.22275716066360474, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0192, + "step": 2020 + }, + { + "epoch": 1.135982092893117, + "grad_norm": 0.5097452402114868, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0198, + "step": 2030 + }, + { + "epoch": 1.1415780637940682, + "grad_norm": 0.3198114037513733, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0218, + "step": 2040 + }, + { + "epoch": 1.1471740346950197, + "grad_norm": 0.1620880514383316, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0227, + "step": 2050 + }, + { + "epoch": 1.1527700055959709, + "grad_norm": 0.2927526831626892, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0195, + "step": 2060 + }, + { + "epoch": 1.1583659764969223, + "grad_norm": 0.2967079281806946, + "learning_rate": 9.921951064166684e-05, + "loss": 0.024, + "step": 2070 + }, + { + "epoch": 1.1639619473978735, + "grad_norm": 0.19401852786540985, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.169557918298825, + "grad_norm": 0.28363627195358276, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0188, + "step": 2090 + }, + { + "epoch": 1.1751538891997761, + "grad_norm": 0.3623961806297302, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0206, + "step": 2100 + }, + { + "epoch": 1.1807498601007276, + "grad_norm": 0.503246545791626, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0253, + "step": 2110 + }, + { + "epoch": 1.1863458310016788, + "grad_norm": 0.7744673490524292, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 1.19194180190263, + "grad_norm": 0.48357081413269043, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0309, + "step": 2130 + }, + { + "epoch": 1.1975377728035814, + "grad_norm": 0.22658684849739075, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0208, + "step": 2140 + }, + { + "epoch": 1.2031337437045329, + "grad_norm": 0.40776172280311584, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 1.208729714605484, + "grad_norm": 0.48974546790122986, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0278, + "step": 2160 + }, + { + "epoch": 1.2143256855064353, + "grad_norm": 0.3066832423210144, + "learning_rate": 9.90672840803519e-05, + "loss": 0.018, + "step": 2170 + }, + { + "epoch": 1.2199216564073867, + "grad_norm": 0.22434163093566895, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0141, + "step": 2180 + }, + { + "epoch": 1.225517627308338, + "grad_norm": 0.3365159034729004, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0205, + "step": 2190 + }, + { + "epoch": 1.2311135982092893, + "grad_norm": 0.3467719256877899, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0206, + "step": 2200 + }, + { + "epoch": 1.2367095691102405, + "grad_norm": 0.31818097829818726, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0255, + "step": 2210 + }, + { + "epoch": 1.242305540011192, + "grad_norm": 0.3118780851364136, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0211, + "step": 2220 + }, + { + "epoch": 1.2479015109121432, + "grad_norm": 0.2563456594944, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0249, + "step": 2230 + }, + { + "epoch": 1.2534974818130946, + "grad_norm": 0.4434971213340759, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0214, + "step": 2240 + }, + { + "epoch": 1.2590934527140458, + "grad_norm": 0.36243245005607605, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0239, + "step": 2250 + }, + { + "epoch": 1.2646894236149973, + "grad_norm": 0.4027983546257019, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 1.2702853945159485, + "grad_norm": 0.4992479383945465, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 1.2758813654169, + "grad_norm": 0.5188339948654175, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0252, + "step": 2280 + }, + { + "epoch": 1.281477336317851, + "grad_norm": 0.2691977620124817, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0238, + "step": 2290 + }, + { + "epoch": 1.2870733072188025, + "grad_norm": 0.42759424448013306, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0252, + "step": 2300 + }, + { + "epoch": 1.2926692781197537, + "grad_norm": 0.315560519695282, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0184, + "step": 2310 + }, + { + "epoch": 1.2982652490207052, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.881380604901964e-05, + "loss": 0.026, + "step": 2320 + }, + { + "epoch": 1.3038612199216564, + "grad_norm": 0.322465717792511, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0217, + "step": 2330 + }, + { + "epoch": 1.3094571908226076, + "grad_norm": 0.31809547543525696, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0219, + "step": 2340 + }, + { + "epoch": 1.315053161723559, + "grad_norm": 0.4411179721355438, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0221, + "step": 2350 + }, + { + "epoch": 1.3206491326245104, + "grad_norm": 0.44775789976119995, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0234, + "step": 2360 + }, + { + "epoch": 1.3262451035254617, + "grad_norm": 0.5176445245742798, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.3318410744264129, + "grad_norm": 0.36430883407592773, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0205, + "step": 2380 + }, + { + "epoch": 1.3374370453273643, + "grad_norm": 0.5294170379638672, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0266, + "step": 2390 + }, + { + "epoch": 1.3430330162283157, + "grad_norm": 0.3633783459663391, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0208, + "step": 2400 + }, + { + "epoch": 1.348628987129267, + "grad_norm": 0.5161033272743225, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0201, + "step": 2410 + }, + { + "epoch": 1.3542249580302181, + "grad_norm": 0.6746691465377808, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0243, + "step": 2420 + }, + { + "epoch": 1.3598209289311696, + "grad_norm": 0.2213054746389389, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0209, + "step": 2430 + }, + { + "epoch": 1.365416899832121, + "grad_norm": 0.6545590162277222, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0225, + "step": 2440 + }, + { + "epoch": 1.3710128707330722, + "grad_norm": 0.46804091334342957, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0271, + "step": 2450 + }, + { + "epoch": 1.3766088416340234, + "grad_norm": 0.38381436467170715, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0233, + "step": 2460 + }, + { + "epoch": 1.3822048125349748, + "grad_norm": 0.41659992933273315, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0229, + "step": 2470 + }, + { + "epoch": 1.387800783435926, + "grad_norm": 0.4473920464515686, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0278, + "step": 2480 + }, + { + "epoch": 1.3933967543368775, + "grad_norm": 0.3903592824935913, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0222, + "step": 2490 + }, + { + "epoch": 1.3989927252378287, + "grad_norm": 0.296999454498291, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0287, + "step": 2500 + }, + { + "epoch": 1.4045886961387801, + "grad_norm": 0.45139339566230774, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0253, + "step": 2510 + }, + { + "epoch": 1.4101846670397313, + "grad_norm": 0.29245492815971375, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0149, + "step": 2520 + }, + { + "epoch": 1.4157806379406828, + "grad_norm": 0.2889615595340729, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0224, + "step": 2530 + }, + { + "epoch": 1.421376608841634, + "grad_norm": 0.4102277457714081, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0268, + "step": 2540 + }, + { + "epoch": 1.4269725797425854, + "grad_norm": 0.5045889616012573, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0195, + "step": 2550 + }, + { + "epoch": 1.4325685506435366, + "grad_norm": 0.5412267446517944, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0262, + "step": 2560 + }, + { + "epoch": 1.438164521544488, + "grad_norm": 0.5022779703140259, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0268, + "step": 2570 + }, + { + "epoch": 1.4437604924454392, + "grad_norm": 0.5818321108818054, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0262, + "step": 2580 + }, + { + "epoch": 1.4493564633463907, + "grad_norm": 0.3627963066101074, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0161, + "step": 2590 + }, + { + "epoch": 1.4549524342473419, + "grad_norm": 0.35047340393066406, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0245, + "step": 2600 + }, + { + "epoch": 1.4605484051482933, + "grad_norm": 0.2970013916492462, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0206, + "step": 2610 + }, + { + "epoch": 1.4661443760492445, + "grad_norm": 0.39108118414878845, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0229, + "step": 2620 + }, + { + "epoch": 1.4717403469501957, + "grad_norm": 0.30723538994789124, + "learning_rate": 9.819499966239243e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 1.4773363178511472, + "grad_norm": 0.316388338804245, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0232, + "step": 2640 + }, + { + "epoch": 1.4829322887520986, + "grad_norm": 0.2693226635456085, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0201, + "step": 2650 + }, + { + "epoch": 1.4885282596530498, + "grad_norm": 0.2165406197309494, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0178, + "step": 2660 + }, + { + "epoch": 1.494124230554001, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0247, + "step": 2670 + }, + { + "epoch": 1.4997202014549524, + "grad_norm": 0.37577569484710693, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0188, + "step": 2680 + }, + { + "epoch": 1.5053161723559039, + "grad_norm": 0.3397989273071289, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0174, + "step": 2690 + }, + { + "epoch": 1.510912143256855, + "grad_norm": 0.11495699733495712, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0193, + "step": 2700 + }, + { + "epoch": 1.5165081141578063, + "grad_norm": 0.3947618305683136, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0195, + "step": 2710 + }, + { + "epoch": 1.5221040850587577, + "grad_norm": 0.3024958670139313, + "learning_rate": 9.799155349053851e-05, + "loss": 0.021, + "step": 2720 + }, + { + "epoch": 1.5277000559597091, + "grad_norm": 0.3651089072227478, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0148, + "step": 2730 + }, + { + "epoch": 1.5332960268606604, + "grad_norm": 0.6126254796981812, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0187, + "step": 2740 + }, + { + "epoch": 1.5388919977616116, + "grad_norm": 0.35577818751335144, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0183, + "step": 2750 + }, + { + "epoch": 1.544487968662563, + "grad_norm": 0.26784461736679077, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0239, + "step": 2760 + }, + { + "epoch": 1.5500839395635144, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0174, + "step": 2770 + }, + { + "epoch": 1.5556799104644656, + "grad_norm": 0.3289090394973755, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0185, + "step": 2780 + }, + { + "epoch": 1.5612758813654168, + "grad_norm": 0.41667595505714417, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0264, + "step": 2790 + }, + { + "epoch": 1.5668718522663683, + "grad_norm": 0.4217163324356079, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0221, + "step": 2800 + }, + { + "epoch": 1.5724678231673195, + "grad_norm": 0.3442951440811157, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0195, + "step": 2810 + }, + { + "epoch": 1.578063794068271, + "grad_norm": 0.38543257117271423, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0189, + "step": 2820 + }, + { + "epoch": 1.5836597649692221, + "grad_norm": 0.6017774939537048, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0254, + "step": 2830 + }, + { + "epoch": 1.5892557358701733, + "grad_norm": 0.5754305720329285, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0224, + "step": 2840 + }, + { + "epoch": 1.5948517067711248, + "grad_norm": 0.2952113747596741, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0209, + "step": 2850 + }, + { + "epoch": 1.6004476776720762, + "grad_norm": 0.3667709231376648, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0221, + "step": 2860 + }, + { + "epoch": 1.6060436485730274, + "grad_norm": 0.543677031993866, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0216, + "step": 2870 + }, + { + "epoch": 1.6116396194739786, + "grad_norm": 0.3521057069301605, + "learning_rate": 9.760366073392246e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 1.61723559037493, + "grad_norm": 0.35763946175575256, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0244, + "step": 2890 + }, + { + "epoch": 1.6228315612758815, + "grad_norm": 0.25549840927124023, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 1.6284275321768327, + "grad_norm": 0.22006206214427948, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0178, + "step": 2910 + }, + { + "epoch": 1.6340235030777839, + "grad_norm": 0.2791355550289154, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0204, + "step": 2920 + }, + { + "epoch": 1.6396194739787353, + "grad_norm": 0.34600383043289185, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0206, + "step": 2930 + }, + { + "epoch": 1.6452154448796867, + "grad_norm": 0.40189531445503235, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0213, + "step": 2940 + }, + { + "epoch": 1.650811415780638, + "grad_norm": 0.21385939419269562, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0287, + "step": 2950 + }, + { + "epoch": 1.6564073866815892, + "grad_norm": 0.4269281327724457, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0226, + "step": 2960 + }, + { + "epoch": 1.6620033575825406, + "grad_norm": 0.46277040243148804, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0206, + "step": 2970 + }, + { + "epoch": 1.667599328483492, + "grad_norm": 0.340044230222702, + "learning_rate": 9.734429148174675e-05, + "loss": 0.016, + "step": 2980 + }, + { + "epoch": 1.6731952993844432, + "grad_norm": 0.33839765191078186, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 1.6787912702853944, + "grad_norm": 0.4214085042476654, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0291, + "step": 3000 + }, + { + "epoch": 1.6843872411863459, + "grad_norm": 0.29594293236732483, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0194, + "step": 3010 + }, + { + "epoch": 1.6899832120872973, + "grad_norm": 0.43080446124076843, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0204, + "step": 3020 + }, + { + "epoch": 1.6955791829882485, + "grad_norm": 0.3255208134651184, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0219, + "step": 3030 + }, + { + "epoch": 1.7011751538891997, + "grad_norm": 0.30094242095947266, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0207, + "step": 3040 + }, + { + "epoch": 1.7067711247901511, + "grad_norm": 0.27606436610221863, + "learning_rate": 9.715502728715826e-05, + "loss": 0.025, + "step": 3050 + }, + { + "epoch": 1.7123670956911026, + "grad_norm": 0.21307139098644257, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 1.7179630665920538, + "grad_norm": 0.4076824188232422, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0181, + "step": 3070 + }, + { + "epoch": 1.723559037493005, + "grad_norm": 0.3973149359226227, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0278, + "step": 3080 + }, + { + "epoch": 1.7291550083939562, + "grad_norm": 0.3367111086845398, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0284, + "step": 3090 + }, + { + "epoch": 1.7347509792949076, + "grad_norm": 0.4137897193431854, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0251, + "step": 3100 + }, + { + "epoch": 1.740346950195859, + "grad_norm": 0.28888463973999023, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0185, + "step": 3110 + }, + { + "epoch": 1.7459429210968103, + "grad_norm": 0.2732876241207123, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0206, + "step": 3120 + }, + { + "epoch": 1.7515388919977615, + "grad_norm": 0.5475505590438843, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 1.757134862898713, + "grad_norm": 0.3212341070175171, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0193, + "step": 3140 + }, + { + "epoch": 1.7627308337996643, + "grad_norm": 0.38309773802757263, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0228, + "step": 3150 + }, + { + "epoch": 1.7683268047006155, + "grad_norm": 0.22085356712341309, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0167, + "step": 3160 + }, + { + "epoch": 1.7739227756015667, + "grad_norm": 0.32358717918395996, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0216, + "step": 3170 + }, + { + "epoch": 1.7795187465025182, + "grad_norm": 0.30354073643684387, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0202, + "step": 3180 + }, + { + "epoch": 1.7851147174034696, + "grad_norm": 0.3479655981063843, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0167, + "step": 3190 + }, + { + "epoch": 1.7907106883044208, + "grad_norm": 0.3674020767211914, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 1.796306659205372, + "grad_norm": 0.2632925808429718, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0166, + "step": 3210 + }, + { + "epoch": 1.8019026301063235, + "grad_norm": 0.22815559804439545, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0182, + "step": 3220 + }, + { + "epoch": 1.8074986010072749, + "grad_norm": 0.2246052771806717, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0186, + "step": 3230 + }, + { + "epoch": 1.813094571908226, + "grad_norm": 0.28712260723114014, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0157, + "step": 3240 + }, + { + "epoch": 1.8186905428091773, + "grad_norm": 0.2282487452030182, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0201, + "step": 3250 + }, + { + "epoch": 1.8242865137101287, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0153, + "step": 3260 + }, + { + "epoch": 1.8298824846110802, + "grad_norm": 0.3519797623157501, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0217, + "step": 3270 + }, + { + "epoch": 1.8354784555120314, + "grad_norm": 0.29638567566871643, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0165, + "step": 3280 + }, + { + "epoch": 1.8410744264129826, + "grad_norm": 0.3102523982524872, + "learning_rate": 9.645832661709444e-05, + "loss": 0.02, + "step": 3290 + }, + { + "epoch": 1.846670397313934, + "grad_norm": 0.31784892082214355, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 1.8522663682148854, + "grad_norm": 0.31783589720726013, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0154, + "step": 3310 + }, + { + "epoch": 1.8578623391158366, + "grad_norm": 0.4002092778682709, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0128, + "step": 3320 + }, + { + "epoch": 1.8634583100167879, + "grad_norm": 0.3656691610813141, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0167, + "step": 3330 + }, + { + "epoch": 1.869054280917739, + "grad_norm": 0.34003934264183044, + "learning_rate": 9.630393468087818e-05, + "loss": 0.018, + "step": 3340 + }, + { + "epoch": 1.8746502518186905, + "grad_norm": 0.3051067888736725, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0192, + "step": 3350 + }, + { + "epoch": 1.880246222719642, + "grad_norm": 0.32361093163490295, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0262, + "step": 3360 + }, + { + "epoch": 1.8858421936205931, + "grad_norm": 0.20856234431266785, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0259, + "step": 3370 + }, + { + "epoch": 1.8914381645215443, + "grad_norm": 0.3916553258895874, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0184, + "step": 3380 + }, + { + "epoch": 1.8970341354224958, + "grad_norm": 0.461211621761322, + "learning_rate": 9.614637793223425e-05, + "loss": 0.018, + "step": 3390 + }, + { + "epoch": 1.9026301063234472, + "grad_norm": 0.4060401916503906, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0196, + "step": 3400 + }, + { + "epoch": 1.9082260772243984, + "grad_norm": 0.362894207239151, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0149, + "step": 3410 + }, + { + "epoch": 1.9138220481253496, + "grad_norm": 0.2224276214838028, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0214, + "step": 3420 + }, + { + "epoch": 1.919418019026301, + "grad_norm": 0.36570799350738525, + "learning_rate": 9.601806109775179e-05, + "loss": 0.019, + "step": 3430 + }, + { + "epoch": 1.9250139899272525, + "grad_norm": 0.37845227122306824, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0283, + "step": 3440 + }, + { + "epoch": 1.9306099608282037, + "grad_norm": 0.2989262044429779, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0195, + "step": 3450 + }, + { + "epoch": 1.936205931729155, + "grad_norm": 0.4651845097541809, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0221, + "step": 3460 + }, + { + "epoch": 1.9418019026301063, + "grad_norm": 0.16341492533683777, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0189, + "step": 3470 + }, + { + "epoch": 1.9473978735310578, + "grad_norm": 0.3499149978160858, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 1.952993844432009, + "grad_norm": 0.5015300512313843, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0287, + "step": 3490 + }, + { + "epoch": 1.9585898153329602, + "grad_norm": 0.3239698112010956, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0168, + "step": 3500 + }, + { + "epoch": 1.9641857862339116, + "grad_norm": 0.29603099822998047, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0204, + "step": 3510 + }, + { + "epoch": 1.969781757134863, + "grad_norm": 0.4523886740207672, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0247, + "step": 3520 + }, + { + "epoch": 1.9753777280358142, + "grad_norm": 0.2664707899093628, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0155, + "step": 3530 + }, + { + "epoch": 1.9809736989367654, + "grad_norm": 0.3717735707759857, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0279, + "step": 3540 + }, + { + "epoch": 1.9865696698377169, + "grad_norm": 0.4721260070800781, + "learning_rate": 9.562105561188069e-05, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 1.9921656407386683, + "grad_norm": 0.19504283368587494, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0251, + "step": 3560 + }, + { + "epoch": 1.9977616116396195, + "grad_norm": 0.3900291919708252, + "learning_rate": 9.555313759603402e-05, + "loss": 0.028, + "step": 3570 + }, + { + "epoch": 2.0033575825405707, + "grad_norm": 0.3327538073062897, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0214, + "step": 3580 + }, + { + "epoch": 2.008953553441522, + "grad_norm": 0.5092990398406982, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0204, + "step": 3590 + }, + { + "epoch": 2.0145495243424736, + "grad_norm": 0.2563795745372772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0242, + "step": 3600 + }, + { + "epoch": 2.020145495243425, + "grad_norm": 0.1788598746061325, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0189, + "step": 3610 + }, + { + "epoch": 2.025741466144376, + "grad_norm": 0.2857683598995209, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0187, + "step": 3620 + }, + { + "epoch": 2.031337437045327, + "grad_norm": 0.25776809453964233, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0176, + "step": 3630 + }, + { + "epoch": 2.036933407946279, + "grad_norm": 0.37827619910240173, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0133, + "step": 3640 + }, + { + "epoch": 2.04252937884723, + "grad_norm": 0.36484652757644653, + "learning_rate": 9.527649142357596e-05, + "loss": 0.021, + "step": 3650 + }, + { + "epoch": 2.0481253497481813, + "grad_norm": 0.41479215025901794, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.0537213206491325, + "grad_norm": 0.261192262172699, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 2.059317291550084, + "grad_norm": 0.3758920431137085, + "learning_rate": 9.517070405476575e-05, + "loss": 0.02, + "step": 3680 + }, + { + "epoch": 2.0649132624510353, + "grad_norm": 0.33406686782836914, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0176, + "step": 3690 + }, + { + "epoch": 2.0705092333519866, + "grad_norm": 0.18889296054840088, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0167, + "step": 3700 + }, + { + "epoch": 2.0761052042529378, + "grad_norm": 0.231406569480896, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0232, + "step": 3710 + }, + { + "epoch": 2.0817011751538894, + "grad_norm": 0.31842225790023804, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0219, + "step": 3720 + }, + { + "epoch": 2.0872971460548406, + "grad_norm": 0.2191598266363144, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0207, + "step": 3730 + }, + { + "epoch": 2.092893116955792, + "grad_norm": 0.3848901391029358, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0205, + "step": 3740 + }, + { + "epoch": 2.098489087856743, + "grad_norm": 0.3654007017612457, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0202, + "step": 3750 + }, + { + "epoch": 2.1040850587576942, + "grad_norm": 0.3708373010158539, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0186, + "step": 3760 + }, + { + "epoch": 2.109681029658646, + "grad_norm": 0.29888278245925903, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0179, + "step": 3770 + }, + { + "epoch": 2.115277000559597, + "grad_norm": 0.3273047208786011, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0194, + "step": 3780 + }, + { + "epoch": 2.1208729714605483, + "grad_norm": 0.30253902077674866, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0172, + "step": 3790 + }, + { + "epoch": 2.1264689423614995, + "grad_norm": 0.3017847537994385, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0239, + "step": 3800 + }, + { + "epoch": 2.132064913262451, + "grad_norm": 0.2024342566728592, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0229, + "step": 3810 + }, + { + "epoch": 2.1376608841634024, + "grad_norm": 0.25708290934562683, + "learning_rate": 9.46623765919727e-05, + "loss": 0.017, + "step": 3820 + }, + { + "epoch": 2.1432568550643536, + "grad_norm": 0.38740572333335876, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0186, + "step": 3830 + }, + { + "epoch": 2.148852825965305, + "grad_norm": 0.41047894954681396, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0197, + "step": 3840 + }, + { + "epoch": 2.1544487968662565, + "grad_norm": 0.26995226740837097, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0183, + "step": 3850 + }, + { + "epoch": 2.1600447677672077, + "grad_norm": 0.3127893805503845, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0206, + "step": 3860 + }, + { + "epoch": 2.165640738668159, + "grad_norm": 0.33325016498565674, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0208, + "step": 3870 + }, + { + "epoch": 2.17123670956911, + "grad_norm": 0.2265041172504425, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0177, + "step": 3880 + }, + { + "epoch": 2.1768326804700617, + "grad_norm": 0.44378480315208435, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0232, + "step": 3890 + }, + { + "epoch": 2.182428651371013, + "grad_norm": 0.2953107953071594, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0201, + "step": 3900 + }, + { + "epoch": 2.188024622271964, + "grad_norm": 0.3876049518585205, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0164, + "step": 3910 + }, + { + "epoch": 2.1936205931729154, + "grad_norm": 0.2669300138950348, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0153, + "step": 3920 + }, + { + "epoch": 2.199216564073867, + "grad_norm": 0.6801855564117432, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0185, + "step": 3930 + }, + { + "epoch": 2.204812534974818, + "grad_norm": 0.3502347469329834, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0253, + "step": 3940 + }, + { + "epoch": 2.2104085058757694, + "grad_norm": 0.3213407099246979, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0221, + "step": 3950 + }, + { + "epoch": 2.2160044767767206, + "grad_norm": 0.45591723918914795, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0303, + "step": 3960 + }, + { + "epoch": 2.2216004476776723, + "grad_norm": 0.5190838575363159, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 2.2271964185786235, + "grad_norm": 0.24658669531345367, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0169, + "step": 3980 + }, + { + "epoch": 2.2327923894795747, + "grad_norm": 0.26745668053627014, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0174, + "step": 3990 + }, + { + "epoch": 2.238388360380526, + "grad_norm": 0.23573242127895355, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0166, + "step": 4000 + }, + { + "epoch": 2.243984331281477, + "grad_norm": 0.38697415590286255, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0175, + "step": 4010 + }, + { + "epoch": 2.2495803021824288, + "grad_norm": 0.26302671432495117, + "learning_rate": 9.389475079423988e-05, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.25517627308338, + "grad_norm": 0.520627498626709, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0196, + "step": 4030 + }, + { + "epoch": 2.260772243984331, + "grad_norm": 0.3094232976436615, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0197, + "step": 4040 + }, + { + "epoch": 2.266368214885283, + "grad_norm": 0.3238268196582794, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0242, + "step": 4050 + }, + { + "epoch": 2.271964185786234, + "grad_norm": 0.37398698925971985, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0225, + "step": 4060 + }, + { + "epoch": 2.2775601566871853, + "grad_norm": 0.22411245107650757, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0259, + "step": 4070 + }, + { + "epoch": 2.2831561275881365, + "grad_norm": 0.2310367226600647, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0198, + "step": 4080 + }, + { + "epoch": 2.2887520984890877, + "grad_norm": 0.4910151958465576, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0234, + "step": 4090 + }, + { + "epoch": 2.2943480693900393, + "grad_norm": 0.2820461392402649, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0176, + "step": 4100 + }, + { + "epoch": 2.2999440402909905, + "grad_norm": 0.22990214824676514, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0185, + "step": 4110 + }, + { + "epoch": 2.3055400111919417, + "grad_norm": 0.33790138363838196, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0178, + "step": 4120 + }, + { + "epoch": 2.311135982092893, + "grad_norm": 0.3388676345348358, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0147, + "step": 4130 + }, + { + "epoch": 2.3167319529938446, + "grad_norm": 0.36007586121559143, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0185, + "step": 4140 + }, + { + "epoch": 2.322327923894796, + "grad_norm": 0.41096752882003784, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0219, + "step": 4150 + }, + { + "epoch": 2.327923894795747, + "grad_norm": 0.2878301441669464, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0159, + "step": 4160 + }, + { + "epoch": 2.3335198656966982, + "grad_norm": 0.32061803340911865, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 2.33911583659765, + "grad_norm": 0.29178762435913086, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0178, + "step": 4180 + }, + { + "epoch": 2.344711807498601, + "grad_norm": 0.32889455556869507, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0194, + "step": 4190 + }, + { + "epoch": 2.3503077783995523, + "grad_norm": 0.2980196475982666, + "learning_rate": 9.316282404787871e-05, + "loss": 0.015, + "step": 4200 + }, + { + "epoch": 2.3559037493005035, + "grad_norm": 0.21256855130195618, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0151, + "step": 4210 + }, + { + "epoch": 2.361499720201455, + "grad_norm": 0.2378161996603012, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0179, + "step": 4220 + }, + { + "epoch": 2.3670956911024064, + "grad_norm": 0.211124449968338, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0147, + "step": 4230 + }, + { + "epoch": 2.3726916620033576, + "grad_norm": 0.3496321439743042, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0144, + "step": 4240 + }, + { + "epoch": 2.378287632904309, + "grad_norm": 0.2865016758441925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0209, + "step": 4250 + }, + { + "epoch": 2.38388360380526, + "grad_norm": 0.22519885003566742, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0177, + "step": 4260 + }, + { + "epoch": 2.3894795747062116, + "grad_norm": 0.41060182452201843, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0169, + "step": 4270 + }, + { + "epoch": 2.395075545607163, + "grad_norm": 0.6265867352485657, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0189, + "step": 4280 + }, + { + "epoch": 2.400671516508114, + "grad_norm": 0.3811153173446655, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0274, + "step": 4290 + }, + { + "epoch": 2.4062674874090657, + "grad_norm": 0.2686716318130493, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0198, + "step": 4300 + }, + { + "epoch": 2.411863458310017, + "grad_norm": 0.31025633215904236, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0159, + "step": 4310 + }, + { + "epoch": 2.417459429210968, + "grad_norm": 0.23998180031776428, + "learning_rate": 9.265359203611987e-05, + "loss": 0.018, + "step": 4320 + }, + { + "epoch": 2.4230554001119193, + "grad_norm": 0.45635882019996643, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0199, + "step": 4330 + }, + { + "epoch": 2.4286513710128705, + "grad_norm": 0.34626588225364685, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0169, + "step": 4340 + }, + { + "epoch": 2.434247341913822, + "grad_norm": 0.27278828620910645, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0173, + "step": 4350 + }, + { + "epoch": 2.4398433128147734, + "grad_norm": 0.5236303806304932, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0226, + "step": 4360 + }, + { + "epoch": 2.4454392837157246, + "grad_norm": 0.27782773971557617, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0194, + "step": 4370 + }, + { + "epoch": 2.451035254616676, + "grad_norm": 0.280048131942749, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0174, + "step": 4380 + }, + { + "epoch": 2.4566312255176275, + "grad_norm": 0.3045734763145447, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0148, + "step": 4390 + }, + { + "epoch": 2.4622271964185787, + "grad_norm": 0.1700965315103531, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0155, + "step": 4400 + }, + { + "epoch": 2.46782316731953, + "grad_norm": 0.3037347197532654, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.473419138220481, + "grad_norm": 0.29750266671180725, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0149, + "step": 4420 + }, + { + "epoch": 2.4790151091214327, + "grad_norm": 0.1919635832309723, + "learning_rate": 9.217203991462815e-05, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 2.484611080022384, + "grad_norm": 0.2919257879257202, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 2.490207050923335, + "grad_norm": 0.17676684260368347, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0175, + "step": 4450 + }, + { + "epoch": 2.4958030218242864, + "grad_norm": 0.24397723376750946, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0179, + "step": 4460 + }, + { + "epoch": 2.501398992725238, + "grad_norm": 0.32645362615585327, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0179, + "step": 4470 + }, + { + "epoch": 2.5069949636261892, + "grad_norm": 0.35162001848220825, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0174, + "step": 4480 + }, + { + "epoch": 2.5125909345271404, + "grad_norm": 0.4019016623497009, + "learning_rate": 9.190348478655724e-05, + "loss": 0.015, + "step": 4490 + }, + { + "epoch": 2.5181869054280916, + "grad_norm": 0.4017965495586395, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0238, + "step": 4500 + }, + { + "epoch": 2.523782876329043, + "grad_norm": 0.41645774245262146, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0143, + "step": 4510 + }, + { + "epoch": 2.5293788472299945, + "grad_norm": 0.28400033712387085, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0196, + "step": 4520 + }, + { + "epoch": 2.5349748181309457, + "grad_norm": 0.4045359492301941, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0191, + "step": 4530 + }, + { + "epoch": 2.540570789031897, + "grad_norm": 0.37660202383995056, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0138, + "step": 4540 + }, + { + "epoch": 2.5461667599328486, + "grad_norm": 0.35835906863212585, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 2.5517627308338, + "grad_norm": 0.3906223177909851, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0204, + "step": 4560 + }, + { + "epoch": 2.557358701734751, + "grad_norm": 0.23904386162757874, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0193, + "step": 4570 + }, + { + "epoch": 2.562954672635702, + "grad_norm": 0.3690219521522522, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0218, + "step": 4580 + }, + { + "epoch": 2.5685506435366534, + "grad_norm": 0.3098298907279968, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0142, + "step": 4590 + }, + { + "epoch": 2.574146614437605, + "grad_norm": 0.5726227164268494, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0168, + "step": 4600 + }, + { + "epoch": 2.5797425853385563, + "grad_norm": 0.32549935579299927, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0228, + "step": 4610 + }, + { + "epoch": 2.5853385562395075, + "grad_norm": 0.35607558488845825, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 2.590934527140459, + "grad_norm": 0.31833362579345703, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0134, + "step": 4630 + }, + { + "epoch": 2.5965304980414103, + "grad_norm": 0.5075991749763489, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0181, + "step": 4640 + }, + { + "epoch": 2.6021264689423615, + "grad_norm": 0.2868656814098358, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0216, + "step": 4650 + }, + { + "epoch": 2.6077224398433128, + "grad_norm": 0.38551998138427734, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0218, + "step": 4660 + }, + { + "epoch": 2.613318410744264, + "grad_norm": 0.3080727756023407, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0263, + "step": 4670 + }, + { + "epoch": 2.618914381645215, + "grad_norm": 0.2743169665336609, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0143, + "step": 4680 + }, + { + "epoch": 2.624510352546167, + "grad_norm": 0.286101758480072, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0219, + "step": 4690 + }, + { + "epoch": 2.630106323447118, + "grad_norm": 0.1881791204214096, + "learning_rate": 9.093124073433463e-05, + "loss": 0.015, + "step": 4700 + }, + { + "epoch": 2.6357022943480692, + "grad_norm": 0.3556104004383087, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0207, + "step": 4710 + }, + { + "epoch": 2.641298265249021, + "grad_norm": 0.2784225344657898, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0159, + "step": 4720 + }, + { + "epoch": 2.646894236149972, + "grad_norm": 0.22262175381183624, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0162, + "step": 4730 + }, + { + "epoch": 2.6524902070509233, + "grad_norm": 0.16783557832241058, + "learning_rate": 9.074041986463808e-05, + "loss": 0.018, + "step": 4740 + }, + { + "epoch": 2.6580861779518745, + "grad_norm": 0.31983381509780884, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0168, + "step": 4750 + }, + { + "epoch": 2.6636821488528257, + "grad_norm": 0.2954675555229187, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0157, + "step": 4760 + }, + { + "epoch": 2.6692781197537774, + "grad_norm": 0.37835440039634705, + "learning_rate": 9.059613423804623e-05, + "loss": 0.016, + "step": 4770 + }, + { + "epoch": 2.6748740906547286, + "grad_norm": 0.30182933807373047, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0223, + "step": 4780 + }, + { + "epoch": 2.68047006155568, + "grad_norm": 0.3329738974571228, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0232, + "step": 4790 + }, + { + "epoch": 2.6860660324566314, + "grad_norm": 0.2866031527519226, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0193, + "step": 4800 + }, + { + "epoch": 2.6916620033575827, + "grad_norm": 0.3558676540851593, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0181, + "step": 4810 + }, + { + "epoch": 2.697257974258534, + "grad_norm": 0.22001361846923828, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0124, + "step": 4820 + }, + { + "epoch": 2.702853945159485, + "grad_norm": 0.28986766934394836, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0179, + "step": 4830 + }, + { + "epoch": 2.7084499160604363, + "grad_norm": 0.3889327347278595, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0186, + "step": 4840 + }, + { + "epoch": 2.714045886961388, + "grad_norm": 0.33833345770835876, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0161, + "step": 4850 + }, + { + "epoch": 2.719641857862339, + "grad_norm": 0.23896977305412292, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0149, + "step": 4860 + }, + { + "epoch": 2.7252378287632903, + "grad_norm": 0.44981443881988525, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 2.730833799664242, + "grad_norm": 0.4389462471008301, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0175, + "step": 4880 + }, + { + "epoch": 2.736429770565193, + "grad_norm": 0.2757073640823364, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0176, + "step": 4890 + }, + { + "epoch": 2.7420257414661444, + "grad_norm": 0.2381424754858017, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0145, + "step": 4900 + }, + { + "epoch": 2.7476217123670956, + "grad_norm": 0.25083616375923157, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0184, + "step": 4910 + }, + { + "epoch": 2.753217683268047, + "grad_norm": 0.3651309013366699, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0201, + "step": 4920 + }, + { + "epoch": 2.7588136541689985, + "grad_norm": 0.19562850892543793, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0158, + "step": 4930 + }, + { + "epoch": 2.7644096250699497, + "grad_norm": 0.646306037902832, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0172, + "step": 4940 + }, + { + "epoch": 2.770005595970901, + "grad_norm": 0.5771059393882751, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0181, + "step": 4950 + }, + { + "epoch": 2.775601566871852, + "grad_norm": 0.2918018400669098, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0199, + "step": 4960 + }, + { + "epoch": 2.7811975377728038, + "grad_norm": 0.5034765601158142, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0172, + "step": 4970 + }, + { + "epoch": 2.786793508673755, + "grad_norm": 0.29646632075309753, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0147, + "step": 4980 + }, + { + "epoch": 2.792389479574706, + "grad_norm": 0.2613969147205353, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0164, + "step": 4990 + }, + { + "epoch": 2.7979854504756574, + "grad_norm": 0.27573442459106445, + "learning_rate": 8.945702546981969e-05, + "loss": 0.018, + "step": 5000 + }, + { + "epoch": 2.8035814213766086, + "grad_norm": 0.33170339465141296, + "learning_rate": 8.940619244685388e-05, + "loss": 0.019, + "step": 5010 + }, + { + "epoch": 2.8091773922775602, + "grad_norm": 0.2994827628135681, + "learning_rate": 8.935525168886262e-05, + "loss": 0.019, + "step": 5020 + }, + { + "epoch": 2.8147733631785115, + "grad_norm": 0.3199397921562195, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0172, + "step": 5030 + }, + { + "epoch": 2.8203693340794627, + "grad_norm": 0.24537423253059387, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0146, + "step": 5040 + }, + { + "epoch": 2.8259653049804143, + "grad_norm": 0.24761222302913666, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0194, + "step": 5050 + }, + { + "epoch": 2.8315612758813655, + "grad_norm": 0.2208421230316162, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0123, + "step": 5060 + }, + { + "epoch": 2.8371572467823167, + "grad_norm": 0.3568471074104309, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0147, + "step": 5070 + }, + { + "epoch": 2.842753217683268, + "grad_norm": 0.24207855761051178, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0166, + "step": 5080 + }, + { + "epoch": 2.848349188584219, + "grad_norm": 0.47056907415390015, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0234, + "step": 5090 + }, + { + "epoch": 2.853945159485171, + "grad_norm": 0.26351991295814514, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0212, + "step": 5100 + }, + { + "epoch": 2.859541130386122, + "grad_norm": 0.2002822756767273, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0191, + "step": 5110 + }, + { + "epoch": 2.865137101287073, + "grad_norm": 0.28489527106285095, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0155, + "step": 5120 + }, + { + "epoch": 2.870733072188025, + "grad_norm": 0.30861204862594604, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0158, + "step": 5130 + }, + { + "epoch": 2.876329043088976, + "grad_norm": 0.2856840193271637, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0201, + "step": 5140 + }, + { + "epoch": 2.8819250139899273, + "grad_norm": 0.3461334705352783, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0184, + "step": 5150 + }, + { + "epoch": 2.8875209848908785, + "grad_norm": 0.22160184383392334, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0171, + "step": 5160 + }, + { + "epoch": 2.8931169557918297, + "grad_norm": 0.2488642781972885, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0153, + "step": 5170 + }, + { + "epoch": 2.8987129266927814, + "grad_norm": 0.33482569456100464, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 2.9043088975937326, + "grad_norm": 0.2865656316280365, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 2.9099048684946838, + "grad_norm": 0.3801150321960449, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0149, + "step": 5200 + }, + { + "epoch": 2.915500839395635, + "grad_norm": 0.24389003217220306, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 2.9210968102965866, + "grad_norm": 0.4815085828304291, + "learning_rate": 8.831402879132446e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 2.926692781197538, + "grad_norm": 0.2196839153766632, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0174, + "step": 5230 + }, + { + "epoch": 2.932288752098489, + "grad_norm": 0.30073830485343933, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0168, + "step": 5240 + }, + { + "epoch": 2.9378847229994403, + "grad_norm": 0.21486796438694, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0128, + "step": 5250 + }, + { + "epoch": 2.9434806939003915, + "grad_norm": 0.31880220770835876, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0209, + "step": 5260 + }, + { + "epoch": 2.949076664801343, + "grad_norm": 0.20475736260414124, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0152, + "step": 5270 + }, + { + "epoch": 2.9546726357022943, + "grad_norm": 0.19735224545001984, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0104, + "step": 5280 + }, + { + "epoch": 2.9602686066032455, + "grad_norm": 0.17013341188430786, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0129, + "step": 5290 + }, + { + "epoch": 2.965864577504197, + "grad_norm": 0.38702845573425293, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0239, + "step": 5300 + }, + { + "epoch": 2.9714605484051484, + "grad_norm": 0.34306514263153076, + "learning_rate": 8.783174018050594e-05, + "loss": 0.03, + "step": 5310 + }, + { + "epoch": 2.9770565193060996, + "grad_norm": 0.26854732632637024, + "learning_rate": 8.77776334424621e-05, + "loss": 0.019, + "step": 5320 + }, + { + "epoch": 2.982652490207051, + "grad_norm": 0.28458869457244873, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0213, + "step": 5330 + }, + { + "epoch": 2.988248461108002, + "grad_norm": 0.28708454966545105, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0173, + "step": 5340 + }, + { + "epoch": 2.9938444320089537, + "grad_norm": 0.35600361227989197, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0179, + "step": 5350 + }, + { + "epoch": 2.999440402909905, + "grad_norm": 0.29637375473976135, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0223, + "step": 5360 + }, + { + "epoch": 3.005036373810856, + "grad_norm": 0.39075925946235657, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0148, + "step": 5370 + }, + { + "epoch": 3.0106323447118073, + "grad_norm": 0.3552566468715668, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0187, + "step": 5380 + }, + { + "epoch": 3.016228315612759, + "grad_norm": 0.2608230710029602, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0205, + "step": 5390 + }, + { + "epoch": 3.02182428651371, + "grad_norm": 0.2771034240722656, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0187, + "step": 5400 + }, + { + "epoch": 3.0274202574146614, + "grad_norm": 0.2750489413738251, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0161, + "step": 5410 + }, + { + "epoch": 3.0330162283156126, + "grad_norm": 0.3373420834541321, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0193, + "step": 5420 + }, + { + "epoch": 3.0386121992165642, + "grad_norm": 0.27592456340789795, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0171, + "step": 5430 + }, + { + "epoch": 3.0442081701175154, + "grad_norm": 0.3381069004535675, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0185, + "step": 5440 + }, + { + "epoch": 3.0498041410184666, + "grad_norm": 0.342650830745697, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0223, + "step": 5450 + }, + { + "epoch": 3.055400111919418, + "grad_norm": 0.2777611017227173, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0135, + "step": 5460 + }, + { + "epoch": 3.0609960828203695, + "grad_norm": 0.26987946033477783, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0182, + "step": 5470 + }, + { + "epoch": 3.0665920537213207, + "grad_norm": 0.24877256155014038, + "learning_rate": 8.689798064925049e-05, + "loss": 0.015, + "step": 5480 + }, + { + "epoch": 3.072188024622272, + "grad_norm": 0.31654706597328186, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0142, + "step": 5490 + }, + { + "epoch": 3.077783995523223, + "grad_norm": 0.22976505756378174, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0119, + "step": 5500 + }, + { + "epoch": 3.083379966424175, + "grad_norm": 0.3443313241004944, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0138, + "step": 5510 + }, + { + "epoch": 3.088975937325126, + "grad_norm": 0.34815511107444763, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0127, + "step": 5520 + }, + { + "epoch": 3.094571908226077, + "grad_norm": 0.392868310213089, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0148, + "step": 5530 + }, + { + "epoch": 3.1001678791270284, + "grad_norm": 0.15690505504608154, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0158, + "step": 5540 + }, + { + "epoch": 3.10576385002798, + "grad_norm": 0.2958482503890991, + "learning_rate": 8.650497541989482e-05, + "loss": 0.015, + "step": 5550 + }, + { + "epoch": 3.1113598209289313, + "grad_norm": 0.34652698040008545, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0186, + "step": 5560 + }, + { + "epoch": 3.1169557918298825, + "grad_norm": 0.2787473201751709, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0171, + "step": 5570 + }, + { + "epoch": 3.1225517627308337, + "grad_norm": 0.28770115971565247, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0088, + "step": 5580 + }, + { + "epoch": 3.128147733631785, + "grad_norm": 0.16269604861736298, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0144, + "step": 5590 + }, + { + "epoch": 3.1337437045327365, + "grad_norm": 0.2170538753271103, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0145, + "step": 5600 + }, + { + "epoch": 3.1393396754336877, + "grad_norm": 0.1933916211128235, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0145, + "step": 5610 + }, + { + "epoch": 3.144935646334639, + "grad_norm": 0.28321388363838196, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0171, + "step": 5620 + }, + { + "epoch": 3.1505316172355906, + "grad_norm": 0.1729007363319397, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0103, + "step": 5630 + }, + { + "epoch": 3.156127588136542, + "grad_norm": 0.41079893708229065, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0159, + "step": 5640 + }, + { + "epoch": 3.161723559037493, + "grad_norm": 0.4628431797027588, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0184, + "step": 5650 + }, + { + "epoch": 3.1673195299384442, + "grad_norm": 0.30907726287841797, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0183, + "step": 5660 + }, + { + "epoch": 3.1729155008393954, + "grad_norm": 0.19282157719135284, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0127, + "step": 5670 + }, + { + "epoch": 3.178511471740347, + "grad_norm": 0.27166658639907837, + "learning_rate": 8.576217467724128e-05, + "loss": 0.023, + "step": 5680 + }, + { + "epoch": 3.1841074426412983, + "grad_norm": 0.3486577272415161, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0108, + "step": 5690 + }, + { + "epoch": 3.1897034135422495, + "grad_norm": 0.295238733291626, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.1952993844432007, + "grad_norm": 0.20616333186626434, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0202, + "step": 5710 + }, + { + "epoch": 3.2008953553441524, + "grad_norm": 0.12979304790496826, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 3.2064913262451036, + "grad_norm": 0.23997394740581512, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0159, + "step": 5730 + }, + { + "epoch": 3.212087297146055, + "grad_norm": 0.2359701246023178, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0136, + "step": 5740 + }, + { + "epoch": 3.217683268047006, + "grad_norm": 0.25309842824935913, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0154, + "step": 5750 + }, + { + "epoch": 3.2232792389479576, + "grad_norm": 0.26648661494255066, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0132, + "step": 5760 + }, + { + "epoch": 3.228875209848909, + "grad_norm": 0.32268235087394714, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0133, + "step": 5770 + }, + { + "epoch": 3.23447118074986, + "grad_norm": 0.2632688283920288, + "learning_rate": 8.517952785058385e-05, + "loss": 0.017, + "step": 5780 + }, + { + "epoch": 3.2400671516508113, + "grad_norm": 0.16985219717025757, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0143, + "step": 5790 + }, + { + "epoch": 3.245663122551763, + "grad_norm": 0.23951981961727142, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0157, + "step": 5800 + }, + { + "epoch": 3.251259093452714, + "grad_norm": 0.36843812465667725, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0198, + "step": 5810 + }, + { + "epoch": 3.2568550643536653, + "grad_norm": 0.27591267228126526, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0246, + "step": 5820 + }, + { + "epoch": 3.2624510352546165, + "grad_norm": 0.3020281195640564, + "learning_rate": 8.488458772904684e-05, + "loss": 0.018, + "step": 5830 + }, + { + "epoch": 3.2680470061555678, + "grad_norm": 0.20429036021232605, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0154, + "step": 5840 + }, + { + "epoch": 3.2736429770565194, + "grad_norm": 0.3011918067932129, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0181, + "step": 5850 + }, + { + "epoch": 3.2792389479574706, + "grad_norm": 0.20082388818264008, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0118, + "step": 5860 + }, + { + "epoch": 3.284834918858422, + "grad_norm": 0.25404563546180725, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0122, + "step": 5870 + }, + { + "epoch": 3.2904308897593735, + "grad_norm": 0.17162342369556427, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0178, + "step": 5880 + }, + { + "epoch": 3.2960268606603247, + "grad_norm": 0.2713855803012848, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0127, + "step": 5890 + }, + { + "epoch": 3.301622831561276, + "grad_norm": 0.25792196393013, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0151, + "step": 5900 + }, + { + "epoch": 3.307218802462227, + "grad_norm": 0.24708054959774017, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0205, + "step": 5910 + }, + { + "epoch": 3.3128147733631783, + "grad_norm": 0.22907878458499908, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0196, + "step": 5920 + }, + { + "epoch": 3.31841074426413, + "grad_norm": 0.42451682686805725, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0181, + "step": 5930 + }, + { + "epoch": 3.324006715165081, + "grad_norm": 0.2787477970123291, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0163, + "step": 5940 + }, + { + "epoch": 3.3296026860660324, + "grad_norm": 0.2536604404449463, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0153, + "step": 5950 + }, + { + "epoch": 3.3351986569669836, + "grad_norm": 0.27685803174972534, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0171, + "step": 5960 + }, + { + "epoch": 3.3407946278679352, + "grad_norm": 0.21129871904850006, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0146, + "step": 5970 + }, + { + "epoch": 3.3463905987688864, + "grad_norm": 0.2712884247303009, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0124, + "step": 5980 + }, + { + "epoch": 3.3519865696698377, + "grad_norm": 0.28807780146598816, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0151, + "step": 5990 + }, + { + "epoch": 3.357582540570789, + "grad_norm": 0.24215184152126312, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 3.3631785114717405, + "grad_norm": 0.3111182451248169, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0148, + "step": 6010 + }, + { + "epoch": 3.3687744823726917, + "grad_norm": 0.3122502267360687, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0138, + "step": 6020 + }, + { + "epoch": 3.374370453273643, + "grad_norm": 0.23829977214336395, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0172, + "step": 6030 + }, + { + "epoch": 3.379966424174594, + "grad_norm": 0.22568489611148834, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0191, + "step": 6040 + }, + { + "epoch": 3.385562395075546, + "grad_norm": 0.37260109186172485, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0146, + "step": 6050 + }, + { + "epoch": 3.391158365976497, + "grad_norm": 0.36467012763023376, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0144, + "step": 6060 + }, + { + "epoch": 3.396754336877448, + "grad_norm": 0.28992265462875366, + "learning_rate": 8.343604577838964e-05, + "loss": 0.014, + "step": 6070 + }, + { + "epoch": 3.4023503077783994, + "grad_norm": 0.3018409311771393, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0126, + "step": 6080 + }, + { + "epoch": 3.4079462786793506, + "grad_norm": 0.31771036982536316, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0124, + "step": 6090 + }, + { + "epoch": 3.4135422495803023, + "grad_norm": 0.2008838802576065, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0181, + "step": 6100 + }, + { + "epoch": 3.4191382204812535, + "grad_norm": 0.3000880777835846, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0178, + "step": 6110 + }, + { + "epoch": 3.4247341913822047, + "grad_norm": 0.201462984085083, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0121, + "step": 6120 + }, + { + "epoch": 3.4303301622831563, + "grad_norm": 0.29394298791885376, + "learning_rate": 8.306559326618259e-05, + "loss": 0.019, + "step": 6130 + }, + { + "epoch": 3.4359261331841076, + "grad_norm": 0.20683641731739044, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0157, + "step": 6140 + }, + { + "epoch": 3.4415221040850588, + "grad_norm": 0.2323373705148697, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0132, + "step": 6150 + }, + { + "epoch": 3.44711807498601, + "grad_norm": 0.28800690174102783, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0149, + "step": 6160 + }, + { + "epoch": 3.452714045886961, + "grad_norm": 0.24825571477413177, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 3.458310016787913, + "grad_norm": 0.26586174964904785, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0193, + "step": 6180 + }, + { + "epoch": 3.463905987688864, + "grad_norm": 0.384670615196228, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0135, + "step": 6190 + }, + { + "epoch": 3.4695019585898152, + "grad_norm": 0.2598379850387573, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0192, + "step": 6200 + }, + { + "epoch": 3.4750979294907665, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.256660056776076e-05, + "loss": 0.017, + "step": 6210 + }, + { + "epoch": 3.480693900391718, + "grad_norm": 0.29601970314979553, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0236, + "step": 6220 + }, + { + "epoch": 3.4862898712926693, + "grad_norm": 0.2569962739944458, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0148, + "step": 6230 + }, + { + "epoch": 3.4918858421936205, + "grad_norm": 0.18870459496974945, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0166, + "step": 6240 + }, + { + "epoch": 3.4974818130945717, + "grad_norm": 0.20874905586242676, + "learning_rate": 8.231496189304704e-05, + "loss": 0.012, + "step": 6250 + }, + { + "epoch": 3.5030777839955234, + "grad_norm": 0.456989586353302, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 3.5086737548964746, + "grad_norm": 0.3724716305732727, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0164, + "step": 6270 + }, + { + "epoch": 3.514269725797426, + "grad_norm": 0.2510260343551636, + "learning_rate": 8.212530463322583e-05, + "loss": 0.014, + "step": 6280 + }, + { + "epoch": 3.519865696698377, + "grad_norm": 0.17292679846286774, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0135, + "step": 6290 + }, + { + "epoch": 3.5254616675993287, + "grad_norm": 0.25856831669807434, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0159, + "step": 6300 + }, + { + "epoch": 3.53105763850028, + "grad_norm": 0.26525381207466125, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0132, + "step": 6310 + }, + { + "epoch": 3.536653609401231, + "grad_norm": 0.319915235042572, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0113, + "step": 6320 + }, + { + "epoch": 3.5422495803021823, + "grad_norm": 0.23749981820583344, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0145, + "step": 6330 + }, + { + "epoch": 3.5478455512031335, + "grad_norm": 0.25086531043052673, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0165, + "step": 6340 + }, + { + "epoch": 3.553441522104085, + "grad_norm": 0.19675312936306, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "epoch": 3.5590374930050364, + "grad_norm": 0.2085702270269394, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0155, + "step": 6360 + }, + { + "epoch": 3.5646334639059876, + "grad_norm": 0.4404468536376953, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0208, + "step": 6370 + }, + { + "epoch": 3.570229434806939, + "grad_norm": 0.10625205188989639, + "learning_rate": 8.148743122865463e-05, + "loss": 0.015, + "step": 6380 + }, + { + "epoch": 3.5758254057078904, + "grad_norm": 0.34253987669944763, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0229, + "step": 6390 + }, + { + "epoch": 3.5814213766088416, + "grad_norm": 0.43956324458122253, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 3.587017347509793, + "grad_norm": 0.45199209451675415, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0198, + "step": 6410 + }, + { + "epoch": 3.592613318410744, + "grad_norm": 0.2245771586894989, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0141, + "step": 6420 + }, + { + "epoch": 3.5982092893116957, + "grad_norm": 0.3338348865509033, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0168, + "step": 6430 + }, + { + "epoch": 3.603805260212647, + "grad_norm": 0.21632985770702362, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0117, + "step": 6440 + }, + { + "epoch": 3.609401231113598, + "grad_norm": 0.2893829643726349, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 3.6149972020145498, + "grad_norm": 0.24873918294906616, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0139, + "step": 6460 + }, + { + "epoch": 3.620593172915501, + "grad_norm": 0.31232985854148865, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0122, + "step": 6470 + }, + { + "epoch": 3.626189143816452, + "grad_norm": 0.20202654600143433, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0126, + "step": 6480 + }, + { + "epoch": 3.6317851147174034, + "grad_norm": 0.339890718460083, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0107, + "step": 6490 + }, + { + "epoch": 3.6373810856183546, + "grad_norm": 0.17959007620811462, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0125, + "step": 6500 + }, + { + "epoch": 3.6429770565193063, + "grad_norm": 0.21795189380645752, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 3.6485730274202575, + "grad_norm": 0.17131085693836212, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0197, + "step": 6520 + }, + { + "epoch": 3.6541689983212087, + "grad_norm": 0.180596724152565, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0128, + "step": 6530 + }, + { + "epoch": 3.65976496922216, + "grad_norm": 0.23086079955101013, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0171, + "step": 6540 + }, + { + "epoch": 3.6653609401231115, + "grad_norm": 0.40819284319877625, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0162, + "step": 6550 + }, + { + "epoch": 3.6709569110240627, + "grad_norm": 0.20544512569904327, + "learning_rate": 8.031768475274413e-05, + "loss": 0.01, + "step": 6560 + }, + { + "epoch": 3.676552881925014, + "grad_norm": 0.3116811513900757, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0183, + "step": 6570 + }, + { + "epoch": 3.682148852825965, + "grad_norm": 0.3111719787120819, + "learning_rate": 8.018603611327504e-05, + "loss": 0.015, + "step": 6580 + }, + { + "epoch": 3.6877448237269164, + "grad_norm": 0.20265722274780273, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0136, + "step": 6590 + }, + { + "epoch": 3.693340794627868, + "grad_norm": 0.35717812180519104, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0098, + "step": 6600 + }, + { + "epoch": 3.6989367655288192, + "grad_norm": 0.45737767219543457, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0115, + "step": 6610 + }, + { + "epoch": 3.7045327364297704, + "grad_norm": 0.3025696873664856, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0159, + "step": 6620 + }, + { + "epoch": 3.710128707330722, + "grad_norm": 0.3852231502532959, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0116, + "step": 6630 + }, + { + "epoch": 3.7157246782316733, + "grad_norm": 0.23505637049674988, + "learning_rate": 7.978911531372765e-05, + "loss": 0.012, + "step": 6640 + }, + { + "epoch": 3.7213206491326245, + "grad_norm": 0.16072528064250946, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0101, + "step": 6650 + }, + { + "epoch": 3.7269166200335757, + "grad_norm": 0.2579629719257355, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 3.732512590934527, + "grad_norm": 0.170463427901268, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0164, + "step": 6670 + }, + { + "epoch": 3.7381085618354786, + "grad_norm": 0.2048628181219101, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0095, + "step": 6680 + }, + { + "epoch": 3.74370453273643, + "grad_norm": 0.1665850281715393, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0131, + "step": 6690 + }, + { + "epoch": 3.749300503637381, + "grad_norm": 0.184804305434227, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0161, + "step": 6700 + }, + { + "epoch": 3.7548964745383326, + "grad_norm": 0.17109259963035583, + "learning_rate": 7.932233821142987e-05, + "loss": 0.014, + "step": 6710 + }, + { + "epoch": 3.760492445439284, + "grad_norm": 0.23285003006458282, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 3.766088416340235, + "grad_norm": 0.21361905336380005, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0218, + "step": 6730 + }, + { + "epoch": 3.7716843872411863, + "grad_norm": 0.22354750335216522, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0203, + "step": 6740 + }, + { + "epoch": 3.7772803581421375, + "grad_norm": 0.24767528474330902, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0193, + "step": 6750 + }, + { + "epoch": 3.782876329043089, + "grad_norm": 0.18995364010334015, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0162, + "step": 6760 + }, + { + "epoch": 3.7884722999440403, + "grad_norm": 0.13995826244354248, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0187, + "step": 6770 + }, + { + "epoch": 3.7940682708449915, + "grad_norm": 0.2525804340839386, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0136, + "step": 6780 + }, + { + "epoch": 3.799664241745943, + "grad_norm": 0.17206352949142456, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0133, + "step": 6790 + }, + { + "epoch": 3.8052602126468944, + "grad_norm": 0.17433176934719086, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0257, + "step": 6800 + }, + { + "epoch": 3.8108561835478456, + "grad_norm": 0.2698834240436554, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0141, + "step": 6810 + }, + { + "epoch": 3.816452154448797, + "grad_norm": 0.2874978482723236, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0175, + "step": 6820 + }, + { + "epoch": 3.822048125349748, + "grad_norm": 0.267092227935791, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0169, + "step": 6830 + }, + { + "epoch": 3.8276440962506992, + "grad_norm": 0.31751275062561035, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0175, + "step": 6840 + }, + { + "epoch": 3.833240067151651, + "grad_norm": 0.30981171131134033, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0145, + "step": 6850 + }, + { + "epoch": 3.838836038052602, + "grad_norm": 0.31560707092285156, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0157, + "step": 6860 + }, + { + "epoch": 3.8444320089535533, + "grad_norm": 0.22295020520687103, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0108, + "step": 6870 + }, + { + "epoch": 3.850027979854505, + "grad_norm": 0.25469842553138733, + "learning_rate": 7.817250808190483e-05, + "loss": 0.015, + "step": 6880 + }, + { + "epoch": 3.855623950755456, + "grad_norm": 0.3890667259693146, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0179, + "step": 6890 + }, + { + "epoch": 3.8612199216564074, + "grad_norm": 0.1923862248659134, + "learning_rate": 7.803575286758364e-05, + "loss": 0.013, + "step": 6900 + }, + { + "epoch": 3.8668158925573586, + "grad_norm": 0.17686985433101654, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0143, + "step": 6910 + }, + { + "epoch": 3.87241186345831, + "grad_norm": 0.1899517923593521, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0178, + "step": 6920 + }, + { + "epoch": 3.8780078343592614, + "grad_norm": 0.3056480586528778, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0129, + "step": 6930 + }, + { + "epoch": 3.8836038052602126, + "grad_norm": 0.27795109152793884, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 3.889199776161164, + "grad_norm": 0.22460781037807465, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0145, + "step": 6950 + }, + { + "epoch": 3.8947957470621155, + "grad_norm": 0.29980891942977905, + "learning_rate": 7.762365365649067e-05, + "loss": 0.015, + "step": 6960 + }, + { + "epoch": 3.9003917179630667, + "grad_norm": 0.2440609186887741, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0137, + "step": 6970 + }, + { + "epoch": 3.905987688864018, + "grad_norm": 0.2510973811149597, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0118, + "step": 6980 + }, + { + "epoch": 3.911583659764969, + "grad_norm": 0.4981507956981659, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0217, + "step": 6990 + }, + { + "epoch": 3.9171796306659203, + "grad_norm": 0.28161290287971497, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0154, + "step": 7000 + }, + { + "epoch": 3.922775601566872, + "grad_norm": 0.40513697266578674, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0169, + "step": 7010 + }, + { + "epoch": 3.928371572467823, + "grad_norm": 0.31741997599601746, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0156, + "step": 7020 + }, + { + "epoch": 3.9339675433687744, + "grad_norm": 0.2534908652305603, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0142, + "step": 7030 + }, + { + "epoch": 3.939563514269726, + "grad_norm": 0.2655825912952423, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0154, + "step": 7040 + }, + { + "epoch": 3.9451594851706773, + "grad_norm": 0.32799914479255676, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0137, + "step": 7050 + }, + { + "epoch": 3.9507554560716285, + "grad_norm": 0.184087872505188, + "learning_rate": 7.693080007570084e-05, + "loss": 0.013, + "step": 7060 + }, + { + "epoch": 3.9563514269725797, + "grad_norm": 0.31337958574295044, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0203, + "step": 7070 + }, + { + "epoch": 3.961947397873531, + "grad_norm": 0.44696512818336487, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0136, + "step": 7080 + }, + { + "epoch": 3.967543368774482, + "grad_norm": 0.2737766206264496, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0157, + "step": 7090 + }, + { + "epoch": 3.9731393396754338, + "grad_norm": 0.4152137339115143, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0151, + "step": 7100 + }, + { + "epoch": 3.978735310576385, + "grad_norm": 0.25766709446907043, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0185, + "step": 7110 + }, + { + "epoch": 3.984331281477336, + "grad_norm": 0.2175714522600174, + "learning_rate": 7.651154166637025e-05, + "loss": 0.013, + "step": 7120 + }, + { + "epoch": 3.989927252378288, + "grad_norm": 0.2838795483112335, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0152, + "step": 7130 + }, + { + "epoch": 3.995523223279239, + "grad_norm": 0.17076176404953003, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0161, + "step": 7140 + }, + { + "epoch": 4.00111919418019, + "grad_norm": 0.34454286098480225, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0155, + "step": 7150 + }, + { + "epoch": 4.0067151650811414, + "grad_norm": 0.2543468773365021, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0224, + "step": 7160 + }, + { + "epoch": 4.012311135982093, + "grad_norm": 0.26474493741989136, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0121, + "step": 7170 + }, + { + "epoch": 4.017907106883044, + "grad_norm": 0.2469242513179779, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0168, + "step": 7180 + }, + { + "epoch": 4.023503077783996, + "grad_norm": 0.2605207562446594, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0142, + "step": 7190 + }, + { + "epoch": 4.029099048684947, + "grad_norm": 0.1799083948135376, + "learning_rate": 7.594847868906076e-05, + "loss": 0.02, + "step": 7200 + }, + { + "epoch": 4.034695019585898, + "grad_norm": 0.179059699177742, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0115, + "step": 7210 + }, + { + "epoch": 4.04029099048685, + "grad_norm": 0.2233004868030548, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0128, + "step": 7220 + }, + { + "epoch": 4.045886961387801, + "grad_norm": 0.253635436296463, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0149, + "step": 7230 + }, + { + "epoch": 4.051482932288752, + "grad_norm": 0.3416047692298889, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0125, + "step": 7240 + }, + { + "epoch": 4.057078903189703, + "grad_norm": 0.27430468797683716, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0153, + "step": 7250 + }, + { + "epoch": 4.062674874090654, + "grad_norm": 0.26396802067756653, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0128, + "step": 7260 + }, + { + "epoch": 4.068270844991606, + "grad_norm": 0.1688843071460724, + "learning_rate": 7.545207078751857e-05, + "loss": 0.017, + "step": 7270 + }, + { + "epoch": 4.073866815892558, + "grad_norm": 0.25092509388923645, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0119, + "step": 7280 + }, + { + "epoch": 4.079462786793509, + "grad_norm": 0.12876421213150024, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0099, + "step": 7290 + }, + { + "epoch": 4.08505875769446, + "grad_norm": 0.13818064332008362, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0156, + "step": 7300 + }, + { + "epoch": 4.090654728595411, + "grad_norm": 0.23580847680568695, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0122, + "step": 7310 + }, + { + "epoch": 4.096250699496363, + "grad_norm": 0.22529348731040955, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0115, + "step": 7320 + }, + { + "epoch": 4.101846670397314, + "grad_norm": 0.29066744446754456, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0125, + "step": 7330 + }, + { + "epoch": 4.107442641298265, + "grad_norm": 0.30195966362953186, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "epoch": 4.113038612199216, + "grad_norm": 0.2478567361831665, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0138, + "step": 7350 + }, + { + "epoch": 4.118634583100168, + "grad_norm": 0.23493291437625885, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0171, + "step": 7360 + }, + { + "epoch": 4.1242305540011195, + "grad_norm": 0.28376439213752747, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0161, + "step": 7370 + }, + { + "epoch": 4.129826524902071, + "grad_norm": 0.183238685131073, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0159, + "step": 7380 + }, + { + "epoch": 4.135422495803022, + "grad_norm": 0.26259323954582214, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0139, + "step": 7390 + }, + { + "epoch": 4.141018466703973, + "grad_norm": 0.31283116340637207, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0103, + "step": 7400 + }, + { + "epoch": 4.146614437604924, + "grad_norm": 0.3131321370601654, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0147, + "step": 7410 + }, + { + "epoch": 4.1522104085058755, + "grad_norm": 0.22739440202713013, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0199, + "step": 7420 + }, + { + "epoch": 4.157806379406827, + "grad_norm": 0.22918283939361572, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0152, + "step": 7430 + }, + { + "epoch": 4.163402350307779, + "grad_norm": 0.3530014455318451, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0123, + "step": 7440 + }, + { + "epoch": 4.16899832120873, + "grad_norm": 0.32133522629737854, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0151, + "step": 7450 + }, + { + "epoch": 4.174594292109681, + "grad_norm": 0.13515067100524902, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0123, + "step": 7460 + }, + { + "epoch": 4.1801902630106325, + "grad_norm": 0.39963120222091675, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0132, + "step": 7470 + }, + { + "epoch": 4.185786233911584, + "grad_norm": 0.16310429573059082, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0211, + "step": 7480 + }, + { + "epoch": 4.191382204812535, + "grad_norm": 0.23062337934970856, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0155, + "step": 7490 + }, + { + "epoch": 4.196978175713486, + "grad_norm": 0.3456437289714813, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0149, + "step": 7500 + }, + { + "epoch": 4.202574146614437, + "grad_norm": 0.30712154507637024, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0142, + "step": 7510 + }, + { + "epoch": 4.2081701175153885, + "grad_norm": 0.40980008244514465, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0185, + "step": 7520 + }, + { + "epoch": 4.213766088416341, + "grad_norm": 0.3277069330215454, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0202, + "step": 7530 + }, + { + "epoch": 4.219362059317292, + "grad_norm": 0.2991955280303955, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0129, + "step": 7540 + }, + { + "epoch": 4.224958030218243, + "grad_norm": 0.3370542526245117, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0186, + "step": 7550 + }, + { + "epoch": 4.230554001119194, + "grad_norm": 0.31613653898239136, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0123, + "step": 7560 + }, + { + "epoch": 4.236149972020145, + "grad_norm": 0.21174335479736328, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0088, + "step": 7570 + }, + { + "epoch": 4.241745942921097, + "grad_norm": 0.18430404365062714, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0136, + "step": 7580 + }, + { + "epoch": 4.247341913822048, + "grad_norm": 0.161945641040802, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0146, + "step": 7590 + }, + { + "epoch": 4.252937884722999, + "grad_norm": 0.3349175453186035, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0151, + "step": 7600 + }, + { + "epoch": 4.258533855623951, + "grad_norm": 0.22331948578357697, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0116, + "step": 7610 + }, + { + "epoch": 4.264129826524902, + "grad_norm": 0.32214659452438354, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0125, + "step": 7620 + }, + { + "epoch": 4.269725797425854, + "grad_norm": 0.2628123164176941, + "learning_rate": 7.284672583878219e-05, + "loss": 0.021, + "step": 7630 + }, + { + "epoch": 4.275321768326805, + "grad_norm": 0.17666281759738922, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0129, + "step": 7640 + }, + { + "epoch": 4.280917739227756, + "grad_norm": 0.13651759922504425, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0136, + "step": 7650 + }, + { + "epoch": 4.286513710128707, + "grad_norm": 0.19819198548793793, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0178, + "step": 7660 + }, + { + "epoch": 4.292109681029658, + "grad_norm": 0.30227622389793396, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0099, + "step": 7670 + }, + { + "epoch": 4.29770565193061, + "grad_norm": 0.1803039014339447, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0125, + "step": 7680 + }, + { + "epoch": 4.303301622831562, + "grad_norm": 0.2602524757385254, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0128, + "step": 7690 + }, + { + "epoch": 4.308897593732513, + "grad_norm": 0.22282052040100098, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0105, + "step": 7700 + }, + { + "epoch": 4.314493564633464, + "grad_norm": 0.3194449841976166, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0133, + "step": 7710 + }, + { + "epoch": 4.320089535534415, + "grad_norm": 0.31051668524742126, + "learning_rate": 7.218237771703921e-05, + "loss": 0.021, + "step": 7720 + }, + { + "epoch": 4.3256855064353665, + "grad_norm": 0.23389574885368347, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0151, + "step": 7730 + }, + { + "epoch": 4.331281477336318, + "grad_norm": 0.16604237258434296, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0101, + "step": 7740 + }, + { + "epoch": 4.336877448237269, + "grad_norm": 0.26793259382247925, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0177, + "step": 7750 + }, + { + "epoch": 4.34247341913822, + "grad_norm": 0.21598176658153534, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0168, + "step": 7760 + }, + { + "epoch": 4.348069390039171, + "grad_norm": 0.30887526273727417, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0122, + "step": 7770 + }, + { + "epoch": 4.3536653609401235, + "grad_norm": 0.3463345468044281, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0143, + "step": 7780 + }, + { + "epoch": 4.359261331841075, + "grad_norm": 0.26217085123062134, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 4.364857302742026, + "grad_norm": 0.28720608353614807, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0139, + "step": 7800 + }, + { + "epoch": 4.370453273642977, + "grad_norm": 0.35230302810668945, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0146, + "step": 7810 + }, + { + "epoch": 4.376049244543928, + "grad_norm": 0.2841963469982147, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0153, + "step": 7820 + }, + { + "epoch": 4.3816452154448795, + "grad_norm": 0.3889724016189575, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0147, + "step": 7830 + }, + { + "epoch": 4.387241186345831, + "grad_norm": 0.2717784345149994, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0161, + "step": 7840 + }, + { + "epoch": 4.392837157246782, + "grad_norm": 0.27939334511756897, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0127, + "step": 7850 + }, + { + "epoch": 4.398433128147734, + "grad_norm": 0.24780631065368652, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0134, + "step": 7860 + }, + { + "epoch": 4.404029099048685, + "grad_norm": 0.2736693024635315, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0157, + "step": 7870 + }, + { + "epoch": 4.409625069949636, + "grad_norm": 0.2332269549369812, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0155, + "step": 7880 + }, + { + "epoch": 4.415221040850588, + "grad_norm": 0.3542332947254181, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0161, + "step": 7890 + }, + { + "epoch": 4.420817011751539, + "grad_norm": 0.29242730140686035, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0137, + "step": 7900 + }, + { + "epoch": 4.42641298265249, + "grad_norm": 0.33528995513916016, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0146, + "step": 7910 + }, + { + "epoch": 4.432008953553441, + "grad_norm": 0.34565469622612, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0128, + "step": 7920 + }, + { + "epoch": 4.4376049244543925, + "grad_norm": 0.29550039768218994, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0143, + "step": 7930 + }, + { + "epoch": 4.443200895355345, + "grad_norm": 0.18918676674365997, + "learning_rate": 7.053803645765128e-05, + "loss": 0.017, + "step": 7940 + }, + { + "epoch": 4.448796866256296, + "grad_norm": 0.24842104315757751, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0115, + "step": 7950 + }, + { + "epoch": 4.454392837157247, + "grad_norm": 0.25395554304122925, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0136, + "step": 7960 + }, + { + "epoch": 4.459988808058198, + "grad_norm": 0.223357155919075, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0159, + "step": 7970 + }, + { + "epoch": 4.465584778959149, + "grad_norm": 0.2434312105178833, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0113, + "step": 7980 + }, + { + "epoch": 4.471180749860101, + "grad_norm": 0.27500098943710327, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0127, + "step": 7990 + }, + { + "epoch": 4.476776720761052, + "grad_norm": 0.1670360416173935, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0151, + "step": 8000 + }, + { + "epoch": 4.482372691662003, + "grad_norm": 0.3035995662212372, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0143, + "step": 8010 + }, + { + "epoch": 4.487968662562954, + "grad_norm": 0.25943461060523987, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 4.493564633463906, + "grad_norm": 0.20338699221611023, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0127, + "step": 8030 + }, + { + "epoch": 4.4991606043648575, + "grad_norm": 0.18308840692043304, + "learning_rate": 6.978149344295242e-05, + "loss": 0.012, + "step": 8040 + }, + { + "epoch": 4.504756575265809, + "grad_norm": 0.142523393034935, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0117, + "step": 8050 + }, + { + "epoch": 4.51035254616676, + "grad_norm": 0.26383474469184875, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0171, + "step": 8060 + }, + { + "epoch": 4.515948517067711, + "grad_norm": 0.1817890852689743, + "learning_rate": 6.955346651628771e-05, + "loss": 0.0147, + "step": 8070 + }, + { + "epoch": 4.521544487968662, + "grad_norm": 0.20679673552513123, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0161, + "step": 8080 + }, + { + "epoch": 4.527140458869614, + "grad_norm": 0.2073245346546173, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0104, + "step": 8090 + }, + { + "epoch": 4.532736429770566, + "grad_norm": 0.45759397745132446, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0141, + "step": 8100 + }, + { + "epoch": 4.538332400671517, + "grad_norm": 0.2275332510471344, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0144, + "step": 8110 + }, + { + "epoch": 4.543928371572468, + "grad_norm": 0.24839594960212708, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0153, + "step": 8120 + }, + { + "epoch": 4.549524342473419, + "grad_norm": 0.13045403361320496, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 4.5551203133743705, + "grad_norm": 0.298033207654953, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0148, + "step": 8140 + }, + { + "epoch": 4.560716284275322, + "grad_norm": 0.3102302849292755, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0104, + "step": 8150 + }, + { + "epoch": 4.566312255176273, + "grad_norm": 0.3511497378349304, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0114, + "step": 8160 + }, + { + "epoch": 4.571908226077224, + "grad_norm": 0.19204401969909668, + "learning_rate": 6.878993357458986e-05, + "loss": 0.0144, + "step": 8170 + }, + { + "epoch": 4.577504196978175, + "grad_norm": 0.27601921558380127, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0121, + "step": 8180 + }, + { + "epoch": 4.583100167879127, + "grad_norm": 0.15351536870002747, + "learning_rate": 6.863660384379017e-05, + "loss": 0.017, + "step": 8190 + }, + { + "epoch": 4.588696138780079, + "grad_norm": 0.34269094467163086, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0164, + "step": 8200 + }, + { + "epoch": 4.59429210968103, + "grad_norm": 0.20768719911575317, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0186, + "step": 8210 + }, + { + "epoch": 4.599888080581981, + "grad_norm": 0.29763510823249817, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0134, + "step": 8220 + }, + { + "epoch": 4.605484051482932, + "grad_norm": 0.29871609807014465, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0118, + "step": 8230 + }, + { + "epoch": 4.6110800223838835, + "grad_norm": 0.24642953276634216, + "learning_rate": 6.825239153500029e-05, + "loss": 0.015, + "step": 8240 + }, + { + "epoch": 4.616675993284835, + "grad_norm": 0.20664198696613312, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0165, + "step": 8250 + }, + { + "epoch": 4.622271964185786, + "grad_norm": 0.1941448450088501, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0129, + "step": 8260 + }, + { + "epoch": 4.627867935086737, + "grad_norm": 0.21355387568473816, + "learning_rate": 6.802126365266905e-05, + "loss": 0.013, + "step": 8270 + }, + { + "epoch": 4.633463905987689, + "grad_norm": 0.2642342746257782, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0176, + "step": 8280 + }, + { + "epoch": 4.63905987688864, + "grad_norm": 0.31280654668807983, + "learning_rate": 6.786693169949455e-05, + "loss": 0.017, + "step": 8290 + }, + { + "epoch": 4.644655847789592, + "grad_norm": 0.2257363200187683, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0099, + "step": 8300 + }, + { + "epoch": 4.650251818690543, + "grad_norm": 0.16536390781402588, + "learning_rate": 6.771240435641754e-05, + "loss": 0.012, + "step": 8310 + }, + { + "epoch": 4.655847789591494, + "grad_norm": 0.16031181812286377, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0094, + "step": 8320 + }, + { + "epoch": 4.661443760492445, + "grad_norm": 0.2519717514514923, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 4.6670397313933965, + "grad_norm": 0.11290234327316284, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0187, + "step": 8340 + }, + { + "epoch": 4.6726357022943485, + "grad_norm": 0.18607747554779053, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0123, + "step": 8350 + }, + { + "epoch": 4.6782316731953, + "grad_norm": 0.20653483271598816, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0128, + "step": 8360 + }, + { + "epoch": 4.683827644096251, + "grad_norm": 0.20888541638851166, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0235, + "step": 8370 + }, + { + "epoch": 4.689423614997202, + "grad_norm": 0.23816397786140442, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0125, + "step": 8380 + }, + { + "epoch": 4.695019585898153, + "grad_norm": 0.3250564932823181, + "learning_rate": 6.709237491447249e-05, + "loss": 0.011, + "step": 8390 + }, + { + "epoch": 4.700615556799105, + "grad_norm": 0.3211959898471832, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0124, + "step": 8400 + }, + { + "epoch": 4.706211527700056, + "grad_norm": 0.3432743549346924, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0119, + "step": 8410 + }, + { + "epoch": 4.711807498601007, + "grad_norm": 0.2595174014568329, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0111, + "step": 8420 + }, + { + "epoch": 4.717403469501958, + "grad_norm": 0.283252090215683, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0119, + "step": 8430 + }, + { + "epoch": 4.72299944040291, + "grad_norm": 0.20471790432929993, + "learning_rate": 6.670333090488356e-05, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 4.7285954113038615, + "grad_norm": 0.1850796490907669, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0112, + "step": 8450 + }, + { + "epoch": 4.734191382204813, + "grad_norm": 0.2515677213668823, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0084, + "step": 8460 + }, + { + "epoch": 4.739787353105764, + "grad_norm": 0.25231802463531494, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0149, + "step": 8470 + }, + { + "epoch": 4.745383324006715, + "grad_norm": 0.24691557884216309, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0144, + "step": 8480 + }, + { + "epoch": 4.750979294907666, + "grad_norm": 0.3806649446487427, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0136, + "step": 8490 + }, + { + "epoch": 4.756575265808618, + "grad_norm": 0.233370840549469, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0119, + "step": 8500 + }, + { + "epoch": 4.762171236709569, + "grad_norm": 0.16195163130760193, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0178, + "step": 8510 + }, + { + "epoch": 4.76776720761052, + "grad_norm": 0.25800469517707825, + "learning_rate": 6.607849694751977e-05, + "loss": 0.012, + "step": 8520 + }, + { + "epoch": 4.773363178511472, + "grad_norm": 0.17752796411514282, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 4.778959149412423, + "grad_norm": 0.2168557047843933, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0163, + "step": 8540 + }, + { + "epoch": 4.7845551203133745, + "grad_norm": 0.2908076345920563, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0091, + "step": 8550 + }, + { + "epoch": 4.790151091214326, + "grad_norm": 0.16817107796669006, + "learning_rate": 6.576501813961609e-05, + "loss": 0.012, + "step": 8560 + }, + { + "epoch": 4.795747062115277, + "grad_norm": 0.17607803642749786, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0095, + "step": 8570 + }, + { + "epoch": 4.801343033016228, + "grad_norm": 0.1395525336265564, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0127, + "step": 8580 + }, + { + "epoch": 4.806939003917179, + "grad_norm": 0.12721598148345947, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0127, + "step": 8590 + }, + { + "epoch": 4.812534974818131, + "grad_norm": 0.220106303691864, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0124, + "step": 8600 + }, + { + "epoch": 4.818130945719083, + "grad_norm": 0.1850575953722, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0133, + "step": 8610 + }, + { + "epoch": 4.823726916620034, + "grad_norm": 0.14641323685646057, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0083, + "step": 8620 + }, + { + "epoch": 4.829322887520985, + "grad_norm": 0.2565167546272278, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0146, + "step": 8630 + }, + { + "epoch": 4.834918858421936, + "grad_norm": 0.1807018518447876, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0093, + "step": 8640 + }, + { + "epoch": 4.8405148293228875, + "grad_norm": 0.22783279418945312, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0105, + "step": 8650 + }, + { + "epoch": 4.846110800223839, + "grad_norm": 0.18857407569885254, + "learning_rate": 6.497833413348909e-05, + "loss": 0.012, + "step": 8660 + }, + { + "epoch": 4.85170677112479, + "grad_norm": 0.31593799591064453, + "learning_rate": 6.489943695331923e-05, + "loss": 0.013, + "step": 8670 + }, + { + "epoch": 4.857302742025741, + "grad_norm": 0.3053518533706665, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0106, + "step": 8680 + }, + { + "epoch": 4.862898712926693, + "grad_norm": 0.2662791311740875, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0151, + "step": 8690 + }, + { + "epoch": 4.868494683827644, + "grad_norm": 0.13093920052051544, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0108, + "step": 8700 + }, + { + "epoch": 4.874090654728596, + "grad_norm": 0.17706599831581116, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0118, + "step": 8710 + }, + { + "epoch": 4.879686625629547, + "grad_norm": 0.19158832728862762, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0116, + "step": 8720 + }, + { + "epoch": 4.885282596530498, + "grad_norm": 0.12095298618078232, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0134, + "step": 8730 + }, + { + "epoch": 4.890878567431449, + "grad_norm": 0.2882150411605835, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0112, + "step": 8740 + }, + { + "epoch": 4.8964745383324, + "grad_norm": 0.34821435809135437, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0111, + "step": 8750 + }, + { + "epoch": 4.902070509233352, + "grad_norm": 0.28680020570755005, + "learning_rate": 6.418755520036775e-05, + "loss": 0.011, + "step": 8760 + }, + { + "epoch": 4.907666480134303, + "grad_norm": 0.15372464060783386, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0138, + "step": 8770 + }, + { + "epoch": 4.913262451035255, + "grad_norm": 0.24093207716941833, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0152, + "step": 8780 + }, + { + "epoch": 4.918858421936206, + "grad_norm": 0.3779686689376831, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0173, + "step": 8790 + }, + { + "epoch": 4.924454392837157, + "grad_norm": 0.19445843994617462, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0142, + "step": 8800 + }, + { + "epoch": 4.930050363738109, + "grad_norm": 0.32286763191223145, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0217, + "step": 8810 + }, + { + "epoch": 4.93564633463906, + "grad_norm": 0.27731436491012573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0103, + "step": 8820 + }, + { + "epoch": 4.941242305540011, + "grad_norm": 0.2174469232559204, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0112, + "step": 8830 + }, + { + "epoch": 4.946838276440962, + "grad_norm": 0.20424802601337433, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 4.952434247341914, + "grad_norm": 0.14288559556007385, + "learning_rate": 6.34725316577129e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 4.9580302182428655, + "grad_norm": 0.21734347939491272, + "learning_rate": 6.339289753131649e-05, + "loss": 0.012, + "step": 8860 + }, + { + "epoch": 4.963626189143817, + "grad_norm": 0.29445502161979675, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0116, + "step": 8870 + }, + { + "epoch": 4.969222160044768, + "grad_norm": 0.2319229543209076, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0194, + "step": 8880 + }, + { + "epoch": 4.974818130945719, + "grad_norm": 0.13166509568691254, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0127, + "step": 8890 + }, + { + "epoch": 4.98041410184667, + "grad_norm": 0.2546875774860382, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0115, + "step": 8900 + }, + { + "epoch": 4.9860100727476215, + "grad_norm": 0.2343253493309021, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 4.991606043648573, + "grad_norm": 0.12813247740268707, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0121, + "step": 8920 + }, + { + "epoch": 4.997202014549524, + "grad_norm": 0.11860624700784683, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 5.002797985450476, + "grad_norm": 0.1995118260383606, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0112, + "step": 8940 + }, + { + "epoch": 5.008393956351427, + "grad_norm": 0.2113560289144516, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0118, + "step": 8950 + }, + { + "epoch": 5.0139899272523785, + "grad_norm": 0.321319580078125, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0145, + "step": 8960 + }, + { + "epoch": 5.01958589815333, + "grad_norm": 0.15436704456806183, + "learning_rate": 6.251455347518073e-05, + "loss": 0.011, + "step": 8970 + }, + { + "epoch": 5.025181869054281, + "grad_norm": 0.2929522693157196, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0145, + "step": 8980 + }, + { + "epoch": 5.030777839955232, + "grad_norm": 0.2311781346797943, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0121, + "step": 8990 + }, + { + "epoch": 5.036373810856183, + "grad_norm": 0.16461458802223206, + "learning_rate": 6.227427435703997e-05, + "loss": 0.016, + "step": 9000 + }, + { + "epoch": 5.0419697817571345, + "grad_norm": 0.23925089836120605, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0125, + "step": 9010 + }, + { + "epoch": 5.047565752658087, + "grad_norm": 0.3376557230949402, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0147, + "step": 9020 + }, + { + "epoch": 5.053161723559038, + "grad_norm": 0.20988136529922485, + "learning_rate": 6.203369322287306e-05, + "loss": 0.0139, + "step": 9030 + }, + { + "epoch": 5.058757694459989, + "grad_norm": 0.17247657477855682, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0133, + "step": 9040 + }, + { + "epoch": 5.06435366536094, + "grad_norm": 0.24936120212078094, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0112, + "step": 9050 + }, + { + "epoch": 5.069949636261891, + "grad_norm": 0.1587497889995575, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0127, + "step": 9060 + }, + { + "epoch": 5.075545607162843, + "grad_norm": 0.12296043336391449, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0132, + "step": 9070 + }, + { + "epoch": 5.081141578063794, + "grad_norm": 0.2310076504945755, + "learning_rate": 6.163206960055651e-05, + "loss": 0.013, + "step": 9080 + }, + { + "epoch": 5.086737548964745, + "grad_norm": 0.1278199851512909, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0127, + "step": 9090 + }, + { + "epoch": 5.092333519865696, + "grad_norm": 0.225848987698555, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 5.097929490766648, + "grad_norm": 0.12778952717781067, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0102, + "step": 9110 + }, + { + "epoch": 5.1035254616676, + "grad_norm": 0.2868061065673828, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0102, + "step": 9120 + }, + { + "epoch": 5.109121432568551, + "grad_norm": 0.35349947214126587, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0151, + "step": 9130 + }, + { + "epoch": 5.114717403469502, + "grad_norm": 0.24252165853977203, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0111, + "step": 9140 + }, + { + "epoch": 5.120313374370453, + "grad_norm": 0.17868760228157043, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0102, + "step": 9150 + }, + { + "epoch": 5.125909345271404, + "grad_norm": 0.21379156410694122, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0137, + "step": 9160 + }, + { + "epoch": 5.131505316172356, + "grad_norm": 0.29363685846328735, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0131, + "step": 9170 + }, + { + "epoch": 5.137101287073307, + "grad_norm": 0.330162912607193, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0129, + "step": 9180 + }, + { + "epoch": 5.142697257974259, + "grad_norm": 0.2052110731601715, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0108, + "step": 9190 + }, + { + "epoch": 5.14829322887521, + "grad_norm": 0.17011559009552002, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 5.153889199776161, + "grad_norm": 0.3137349486351013, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0192, + "step": 9210 + }, + { + "epoch": 5.1594851706771125, + "grad_norm": 0.3046635389328003, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0119, + "step": 9220 + }, + { + "epoch": 5.165081141578064, + "grad_norm": 0.1919318437576294, + "learning_rate": 6.042253858615532e-05, + "loss": 0.0139, + "step": 9230 + }, + { + "epoch": 5.170677112479015, + "grad_norm": 0.3815397322177887, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0176, + "step": 9240 + }, + { + "epoch": 5.176273083379966, + "grad_norm": 0.23484662175178528, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0137, + "step": 9250 + }, + { + "epoch": 5.181869054280917, + "grad_norm": 0.1737549602985382, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0112, + "step": 9260 + }, + { + "epoch": 5.1874650251818695, + "grad_norm": 0.28736233711242676, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0112, + "step": 9270 + }, + { + "epoch": 5.193060996082821, + "grad_norm": 0.21343185007572174, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0089, + "step": 9280 + }, + { + "epoch": 5.198656966983772, + "grad_norm": 0.15162508189678192, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0156, + "step": 9290 + }, + { + "epoch": 5.204252937884723, + "grad_norm": 0.2816758155822754, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0093, + "step": 9300 + }, + { + "epoch": 5.209848908785674, + "grad_norm": 0.1730954796075821, + "learning_rate": 5.977478650257374e-05, + "loss": 0.016, + "step": 9310 + }, + { + "epoch": 5.2154448796866255, + "grad_norm": 0.18365302681922913, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0259, + "step": 9320 + }, + { + "epoch": 5.221040850587577, + "grad_norm": 0.12864327430725098, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0119, + "step": 9330 + }, + { + "epoch": 5.226636821488528, + "grad_norm": 0.16363385319709778, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0129, + "step": 9340 + }, + { + "epoch": 5.23223279238948, + "grad_norm": 0.15773551166057587, + "learning_rate": 5.945026200101702e-05, + "loss": 0.0083, + "step": 9350 + }, + { + "epoch": 5.237828763290431, + "grad_norm": 0.22605851292610168, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0096, + "step": 9360 + }, + { + "epoch": 5.243424734191382, + "grad_norm": 0.13637419044971466, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0185, + "step": 9370 + }, + { + "epoch": 5.249020705092334, + "grad_norm": 0.12795643508434296, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.008, + "step": 9380 + }, + { + "epoch": 5.254616675993285, + "grad_norm": 0.2635105550289154, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0162, + "step": 9390 + }, + { + "epoch": 5.260212646894236, + "grad_norm": 0.18397080898284912, + "learning_rate": 5.90440267166055e-05, + "loss": 0.013, + "step": 9400 + }, + { + "epoch": 5.265808617795187, + "grad_norm": 0.23337115347385406, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0105, + "step": 9410 + }, + { + "epoch": 5.2714045886961385, + "grad_norm": 0.24963605403900146, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0098, + "step": 9420 + }, + { + "epoch": 5.27700055959709, + "grad_norm": 0.372761070728302, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "epoch": 5.282596530498042, + "grad_norm": 0.2931661009788513, + "learning_rate": 5.871859208889759e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 5.288192501398993, + "grad_norm": 0.2341478168964386, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.0113, + "step": 9450 + }, + { + "epoch": 5.293788472299944, + "grad_norm": 0.2445063441991806, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0157, + "step": 9460 + }, + { + "epoch": 5.299384443200895, + "grad_norm": 0.22766774892807007, + "learning_rate": 5.847426505870399e-05, + "loss": 0.011, + "step": 9470 + }, + { + "epoch": 5.304980414101847, + "grad_norm": 0.25397437810897827, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0088, + "step": 9480 + }, + { + "epoch": 5.310576385002798, + "grad_norm": 0.2036605179309845, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0138, + "step": 9490 + }, + { + "epoch": 5.316172355903749, + "grad_norm": 0.17595243453979492, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0102, + "step": 9500 + }, + { + "epoch": 5.3217683268047, + "grad_norm": 0.14046894013881683, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0148, + "step": 9510 + }, + { + "epoch": 5.327364297705652, + "grad_norm": 0.2699585556983948, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0107, + "step": 9520 + }, + { + "epoch": 5.3329602686066035, + "grad_norm": 0.15614166855812073, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0118, + "step": 9530 + }, + { + "epoch": 5.338556239507555, + "grad_norm": 0.3686412572860718, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0122, + "step": 9540 + }, + { + "epoch": 5.344152210408506, + "grad_norm": 0.2578679323196411, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0152, + "step": 9550 + }, + { + "epoch": 5.349748181309457, + "grad_norm": 0.24605675041675568, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0106, + "step": 9560 + }, + { + "epoch": 5.355344152210408, + "grad_norm": 0.19138172268867493, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0134, + "step": 9570 + }, + { + "epoch": 5.36094012311136, + "grad_norm": 0.23657287657260895, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0076, + "step": 9580 + }, + { + "epoch": 5.366536094012311, + "grad_norm": 0.13402613997459412, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0092, + "step": 9590 + }, + { + "epoch": 5.372132064913263, + "grad_norm": 0.16487988829612732, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 5.377728035814214, + "grad_norm": 0.1842515617609024, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0101, + "step": 9610 + }, + { + "epoch": 5.383324006715165, + "grad_norm": 0.17961528897285461, + "learning_rate": 5.72496226034123e-05, + "loss": 0.012, + "step": 9620 + }, + { + "epoch": 5.3889199776161165, + "grad_norm": 0.2516380548477173, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.011, + "step": 9630 + }, + { + "epoch": 5.394515948517068, + "grad_norm": 0.1506935954093933, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0075, + "step": 9640 + }, + { + "epoch": 5.400111919418019, + "grad_norm": 0.3415573835372925, + "learning_rate": 5.70041348380039e-05, + "loss": 0.0142, + "step": 9650 + }, + { + "epoch": 5.40570789031897, + "grad_norm": 0.2501567006111145, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0126, + "step": 9660 + }, + { + "epoch": 5.411303861219921, + "grad_norm": 0.15769636631011963, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0206, + "step": 9670 + }, + { + "epoch": 5.416899832120873, + "grad_norm": 0.17793142795562744, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0198, + "step": 9680 + }, + { + "epoch": 5.422495803021825, + "grad_norm": 0.19135138392448425, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0089, + "step": 9690 + }, + { + "epoch": 5.428091773922776, + "grad_norm": 0.1910410374403, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "epoch": 5.433687744823727, + "grad_norm": 0.18896977603435516, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0102, + "step": 9710 + }, + { + "epoch": 5.439283715724678, + "grad_norm": 0.12857311964035034, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0147, + "step": 9720 + }, + { + "epoch": 5.4448796866256295, + "grad_norm": 0.20521825551986694, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0101, + "step": 9730 + }, + { + "epoch": 5.450475657526581, + "grad_norm": 0.16037105023860931, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0114, + "step": 9740 + }, + { + "epoch": 5.456071628427532, + "grad_norm": 0.15576882660388947, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0138, + "step": 9750 + }, + { + "epoch": 5.461667599328483, + "grad_norm": 0.24249835312366486, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0131, + "step": 9760 + }, + { + "epoch": 5.467263570229435, + "grad_norm": 0.27811625599861145, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0098, + "step": 9770 + }, + { + "epoch": 5.472859541130386, + "grad_norm": 0.3673328459262848, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0131, + "step": 9780 + }, + { + "epoch": 5.478455512031338, + "grad_norm": 0.2886298596858978, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0141, + "step": 9790 + }, + { + "epoch": 5.484051482932289, + "grad_norm": 0.2564665973186493, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0104, + "step": 9800 + }, + { + "epoch": 5.48964745383324, + "grad_norm": 0.22507299482822418, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0116, + "step": 9810 + }, + { + "epoch": 5.495243424734191, + "grad_norm": 0.09582646191120148, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0107, + "step": 9820 + }, + { + "epoch": 5.5008393956351425, + "grad_norm": 0.25511208176612854, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0111, + "step": 9830 + }, + { + "epoch": 5.506435366536094, + "grad_norm": 0.14793109893798828, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0088, + "step": 9840 + }, + { + "epoch": 5.512031337437046, + "grad_norm": 0.3215508759021759, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0132, + "step": 9850 + }, + { + "epoch": 5.517627308337997, + "grad_norm": 0.357731431722641, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0119, + "step": 9860 + }, + { + "epoch": 5.523223279238948, + "grad_norm": 0.2520214915275574, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0121, + "step": 9870 + }, + { + "epoch": 5.528819250139899, + "grad_norm": 0.28353017568588257, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 5.534415221040851, + "grad_norm": 0.34384286403656006, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0125, + "step": 9890 + }, + { + "epoch": 5.540011191941802, + "grad_norm": 0.21323193609714508, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0121, + "step": 9900 + }, + { + "epoch": 5.545607162842753, + "grad_norm": 0.3126833736896515, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0125, + "step": 9910 + }, + { + "epoch": 5.551203133743704, + "grad_norm": 0.29106199741363525, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0098, + "step": 9920 + }, + { + "epoch": 5.556799104644655, + "grad_norm": 0.2740892469882965, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0135, + "step": 9930 + }, + { + "epoch": 5.5623950755456075, + "grad_norm": 0.19003938138484955, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0146, + "step": 9940 + }, + { + "epoch": 5.567991046446559, + "grad_norm": 0.2251635491847992, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0153, + "step": 9950 + }, + { + "epoch": 5.57358701734751, + "grad_norm": 0.16961322724819183, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0154, + "step": 9960 + }, + { + "epoch": 5.579182988248461, + "grad_norm": 0.2752644419670105, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0102, + "step": 9970 + }, + { + "epoch": 5.584778959149412, + "grad_norm": 0.24675792455673218, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0123, + "step": 9980 + }, + { + "epoch": 5.590374930050364, + "grad_norm": 0.2074369490146637, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 5.595970900951315, + "grad_norm": 0.22929449379444122, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0123, + "step": 10000 + }, + { + "epoch": 5.601566871852266, + "grad_norm": 0.21107512712478638, + "learning_rate": 5.404657054794189e-05, + "loss": 0.01, + "step": 10010 + }, + { + "epoch": 5.607162842753217, + "grad_norm": 0.3743564188480377, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0173, + "step": 10020 + }, + { + "epoch": 5.612758813654169, + "grad_norm": 0.19637951254844666, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0114, + "step": 10030 + }, + { + "epoch": 5.6183547845551205, + "grad_norm": 0.2417994886636734, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0117, + "step": 10040 + }, + { + "epoch": 5.623950755456072, + "grad_norm": 0.18541017174720764, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0139, + "step": 10050 + }, + { + "epoch": 5.629546726357023, + "grad_norm": 0.26478803157806396, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0108, + "step": 10060 + }, + { + "epoch": 5.635142697257974, + "grad_norm": 0.23468017578125, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0107, + "step": 10070 + }, + { + "epoch": 5.640738668158925, + "grad_norm": 0.2251582145690918, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0105, + "step": 10080 + }, + { + "epoch": 5.6463346390598765, + "grad_norm": 0.18580631911754608, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0107, + "step": 10090 + }, + { + "epoch": 5.651930609960829, + "grad_norm": 0.14670825004577637, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0103, + "step": 10100 + }, + { + "epoch": 5.65752658086178, + "grad_norm": 0.22916555404663086, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0113, + "step": 10110 + }, + { + "epoch": 5.663122551762731, + "grad_norm": 0.1360463947057724, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0121, + "step": 10120 + }, + { + "epoch": 5.668718522663682, + "grad_norm": 0.24525059759616852, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0092, + "step": 10130 + }, + { + "epoch": 5.6743144935646335, + "grad_norm": 0.1451522707939148, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0081, + "step": 10140 + }, + { + "epoch": 5.679910464465585, + "grad_norm": 0.1923244744539261, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0117, + "step": 10150 + }, + { + "epoch": 5.685506435366536, + "grad_norm": 0.18804806470870972, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0095, + "step": 10160 + }, + { + "epoch": 5.691102406267487, + "grad_norm": 0.1880972534418106, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 5.696698377168438, + "grad_norm": 0.18024373054504395, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0119, + "step": 10180 + }, + { + "epoch": 5.70229434806939, + "grad_norm": 0.16494502127170563, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0131, + "step": 10190 + }, + { + "epoch": 5.707890318970342, + "grad_norm": 0.2051820605993271, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0136, + "step": 10200 + }, + { + "epoch": 5.713486289871293, + "grad_norm": 0.21385324001312256, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0101, + "step": 10210 + }, + { + "epoch": 5.719082260772244, + "grad_norm": 0.2170487344264984, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0107, + "step": 10220 + }, + { + "epoch": 5.724678231673195, + "grad_norm": 0.23433655500411987, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.0139, + "step": 10230 + }, + { + "epoch": 5.730274202574146, + "grad_norm": 0.2549709379673004, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0178, + "step": 10240 + }, + { + "epoch": 5.735870173475098, + "grad_norm": 0.11975869536399841, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0101, + "step": 10250 + }, + { + "epoch": 5.74146614437605, + "grad_norm": 0.39216071367263794, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0131, + "step": 10260 + }, + { + "epoch": 5.747062115277, + "grad_norm": 0.2390432357788086, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 5.752658086177952, + "grad_norm": 0.1686331033706665, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0104, + "step": 10280 + }, + { + "epoch": 5.758254057078903, + "grad_norm": 0.28812578320503235, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.01, + "step": 10290 + }, + { + "epoch": 5.763850027979855, + "grad_norm": 0.4722854197025299, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0125, + "step": 10300 + }, + { + "epoch": 5.769445998880806, + "grad_norm": 0.19151827692985535, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0134, + "step": 10310 + }, + { + "epoch": 5.775041969781757, + "grad_norm": 0.2533670961856842, + "learning_rate": 5.148790314815663e-05, + "loss": 0.011, + "step": 10320 + }, + { + "epoch": 5.780637940682708, + "grad_norm": 0.1756027489900589, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0092, + "step": 10330 + }, + { + "epoch": 5.786233911583659, + "grad_norm": 0.2753913700580597, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 5.7918298824846115, + "grad_norm": 0.17530974745750427, + "learning_rate": 5.123997522742151e-05, + "loss": 0.0092, + "step": 10350 + }, + { + "epoch": 5.797425853385563, + "grad_norm": 0.3250185251235962, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0135, + "step": 10360 + }, + { + "epoch": 5.803021824286514, + "grad_norm": 0.2266574651002884, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0174, + "step": 10370 + }, + { + "epoch": 5.808617795187465, + "grad_norm": 0.15442338585853577, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0112, + "step": 10380 + }, + { + "epoch": 5.814213766088416, + "grad_norm": 0.16227369010448456, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0141, + "step": 10390 + }, + { + "epoch": 5.8198097369893675, + "grad_norm": 0.288241982460022, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0142, + "step": 10400 + }, + { + "epoch": 5.825405707890319, + "grad_norm": 0.17878948152065277, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0115, + "step": 10410 + }, + { + "epoch": 5.83100167879127, + "grad_norm": 0.2224341630935669, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0106, + "step": 10420 + }, + { + "epoch": 5.836597649692221, + "grad_norm": 0.1762062907218933, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0115, + "step": 10430 + }, + { + "epoch": 5.842193620593173, + "grad_norm": 0.15165816247463226, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 10440 + }, + { + "epoch": 5.8477895914941245, + "grad_norm": 0.23350821435451508, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0124, + "step": 10450 + }, + { + "epoch": 5.853385562395076, + "grad_norm": 0.3131781816482544, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0136, + "step": 10460 + }, + { + "epoch": 5.858981533296027, + "grad_norm": 0.25165101885795593, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0081, + "step": 10470 + }, + { + "epoch": 5.864577504196978, + "grad_norm": 0.17109723389148712, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0104, + "step": 10480 + }, + { + "epoch": 5.870173475097929, + "grad_norm": 0.14172928035259247, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0096, + "step": 10490 + }, + { + "epoch": 5.8757694459988805, + "grad_norm": 0.15533624589443207, + "learning_rate": 5e-05, + "loss": 0.0075, + "step": 10500 + }, + { + "epoch": 5.881365416899833, + "grad_norm": 0.12869463860988617, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0114, + "step": 10510 + }, + { + "epoch": 5.886961387800784, + "grad_norm": 0.3376826345920563, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0135, + "step": 10520 + }, + { + "epoch": 5.892557358701735, + "grad_norm": 0.20675431191921234, + "learning_rate": 4.975198054446492e-05, + "loss": 0.0106, + "step": 10530 + }, + { + "epoch": 5.898153329602686, + "grad_norm": 0.14309728145599365, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0122, + "step": 10540 + }, + { + "epoch": 5.903749300503637, + "grad_norm": 0.13042593002319336, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0114, + "step": 10550 + }, + { + "epoch": 5.909345271404589, + "grad_norm": 0.14101748168468475, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "epoch": 5.91494124230554, + "grad_norm": 0.22409436106681824, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0109, + "step": 10570 + }, + { + "epoch": 5.920537213206491, + "grad_norm": 0.22155794501304626, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 5.926133184107442, + "grad_norm": 0.1839323341846466, + "learning_rate": 4.925596604392603e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "epoch": 5.931729155008394, + "grad_norm": 0.1160067617893219, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0103, + "step": 10600 + }, + { + "epoch": 5.937325125909346, + "grad_norm": 0.2413625419139862, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0117, + "step": 10610 + }, + { + "epoch": 5.942921096810297, + "grad_norm": 0.19037237763404846, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.011, + "step": 10620 + }, + { + "epoch": 5.948517067711248, + "grad_norm": 0.17303366959095, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0094, + "step": 10630 + }, + { + "epoch": 5.954113038612199, + "grad_norm": 0.2476578801870346, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0118, + "step": 10640 + }, + { + "epoch": 5.95970900951315, + "grad_norm": 0.29616495966911316, + "learning_rate": 4.87600247725785e-05, + "loss": 0.0118, + "step": 10650 + }, + { + "epoch": 5.965304980414102, + "grad_norm": 0.1653703898191452, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0093, + "step": 10660 + }, + { + "epoch": 5.970900951315053, + "grad_norm": 0.2089630663394928, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.0113, + "step": 10670 + }, + { + "epoch": 5.976496922216004, + "grad_norm": 0.14042207598686218, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0091, + "step": 10680 + }, + { + "epoch": 5.982092893116956, + "grad_norm": 0.17145408689975739, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0103, + "step": 10690 + }, + { + "epoch": 5.987688864017907, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0109, + "step": 10700 + }, + { + "epoch": 5.9932848349188586, + "grad_norm": 0.3018309473991394, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0125, + "step": 10710 + }, + { + "epoch": 5.99888080581981, + "grad_norm": 0.1233690157532692, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0093, + "step": 10720 + }, + { + "epoch": 6.004476776720761, + "grad_norm": 0.226378932595253, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0124, + "step": 10730 + }, + { + "epoch": 6.010072747621712, + "grad_norm": 0.149214506149292, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0105, + "step": 10740 + }, + { + "epoch": 6.015668718522663, + "grad_norm": 0.35911405086517334, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0097, + "step": 10750 + }, + { + "epoch": 6.021264689423615, + "grad_norm": 0.3472690284252167, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0115, + "step": 10760 + }, + { + "epoch": 6.026860660324567, + "grad_norm": 0.1740999072790146, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0088, + "step": 10770 + }, + { + "epoch": 6.032456631225518, + "grad_norm": 0.22089268267154694, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "epoch": 6.038052602126469, + "grad_norm": 0.17993643879890442, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0086, + "step": 10790 + }, + { + "epoch": 6.04364857302742, + "grad_norm": 0.3000619113445282, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0117, + "step": 10800 + }, + { + "epoch": 6.0492445439283715, + "grad_norm": 0.16510385274887085, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0098, + "step": 10810 + }, + { + "epoch": 6.054840514829323, + "grad_norm": 0.17736104130744934, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0111, + "step": 10820 + }, + { + "epoch": 6.060436485730274, + "grad_norm": 0.17262353003025055, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0075, + "step": 10830 + }, + { + "epoch": 6.066032456631225, + "grad_norm": 0.15096010267734528, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0086, + "step": 10840 + }, + { + "epoch": 6.071628427532177, + "grad_norm": 0.16276976466178894, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0102, + "step": 10850 + }, + { + "epoch": 6.0772243984331284, + "grad_norm": 0.42808446288108826, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0094, + "step": 10860 + }, + { + "epoch": 6.08282036933408, + "grad_norm": 0.17846183478832245, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0092, + "step": 10870 + }, + { + "epoch": 6.088416340235031, + "grad_norm": 0.2053506076335907, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0086, + "step": 10880 + }, + { + "epoch": 6.094012311135982, + "grad_norm": 0.2614595592021942, + "learning_rate": 4.677796799916845e-05, + "loss": 0.017, + "step": 10890 + }, + { + "epoch": 6.099608282036933, + "grad_norm": 0.2127176970243454, + "learning_rate": 4.669547078371504e-05, + "loss": 0.014, + "step": 10900 + }, + { + "epoch": 6.1052042529378845, + "grad_norm": 0.2204008847475052, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0116, + "step": 10910 + }, + { + "epoch": 6.110800223838836, + "grad_norm": 0.3794216215610504, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0112, + "step": 10920 + }, + { + "epoch": 6.116396194739787, + "grad_norm": 0.22125349938869476, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 6.121992165640739, + "grad_norm": 0.21079552173614502, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0118, + "step": 10940 + }, + { + "epoch": 6.12758813654169, + "grad_norm": 0.17766894400119781, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.007, + "step": 10950 + }, + { + "epoch": 6.133184107442641, + "grad_norm": 0.23495835065841675, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0092, + "step": 10960 + }, + { + "epoch": 6.138780078343593, + "grad_norm": 0.25509214401245117, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0098, + "step": 10970 + }, + { + "epoch": 6.144376049244544, + "grad_norm": 0.2415831834077835, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0105, + "step": 10980 + }, + { + "epoch": 6.149972020145495, + "grad_norm": 0.1638316661119461, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0092, + "step": 10990 + }, + { + "epoch": 6.155567991046446, + "grad_norm": 0.17809127271175385, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0089, + "step": 11000 + }, + { + "epoch": 6.1611639619473975, + "grad_norm": 0.22080188989639282, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.008, + "step": 11010 + }, + { + "epoch": 6.16675993284835, + "grad_norm": 0.19198036193847656, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0099, + "step": 11020 + }, + { + "epoch": 6.172355903749301, + "grad_norm": 0.1567138433456421, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0081, + "step": 11030 + }, + { + "epoch": 6.177951874650252, + "grad_norm": 0.10507390648126602, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0068, + "step": 11040 + }, + { + "epoch": 6.183547845551203, + "grad_norm": 0.2201065570116043, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0111, + "step": 11050 + }, + { + "epoch": 6.189143816452154, + "grad_norm": 0.2924385666847229, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0198, + "step": 11060 + }, + { + "epoch": 6.194739787353106, + "grad_norm": 0.18973895907402039, + "learning_rate": 4.529458433407429e-05, + "loss": 0.0113, + "step": 11070 + }, + { + "epoch": 6.200335758254057, + "grad_norm": 0.2131788432598114, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 6.205931729155008, + "grad_norm": 0.17389249801635742, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0066, + "step": 11090 + }, + { + "epoch": 6.21152770005596, + "grad_norm": 0.21684075891971588, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0071, + "step": 11100 + }, + { + "epoch": 6.217123670956911, + "grad_norm": 0.19866231083869934, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0096, + "step": 11110 + }, + { + "epoch": 6.2227196418578625, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0068, + "step": 11120 + }, + { + "epoch": 6.228315612758814, + "grad_norm": 0.12977780401706696, + "learning_rate": 4.480098449467132e-05, + "loss": 0.0089, + "step": 11130 + }, + { + "epoch": 6.233911583659765, + "grad_norm": 0.32740047574043274, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0092, + "step": 11140 + }, + { + "epoch": 6.239507554560716, + "grad_norm": 0.12163751572370529, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0089, + "step": 11150 + }, + { + "epoch": 6.245103525461667, + "grad_norm": 0.21914434432983398, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0084, + "step": 11160 + }, + { + "epoch": 6.250699496362619, + "grad_norm": 0.23196600377559662, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0095, + "step": 11170 + }, + { + "epoch": 6.25629546726357, + "grad_norm": 0.19451774656772614, + "learning_rate": 4.439004011435979e-05, + "loss": 0.01, + "step": 11180 + }, + { + "epoch": 6.261891438164522, + "grad_norm": 0.20714877545833588, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0124, + "step": 11190 + }, + { + "epoch": 6.267487409065473, + "grad_norm": 0.1735510528087616, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0089, + "step": 11200 + }, + { + "epoch": 6.273083379966424, + "grad_norm": 0.2282591164112091, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0166, + "step": 11210 + }, + { + "epoch": 6.2786793508673755, + "grad_norm": 0.2207183688879013, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0089, + "step": 11220 + }, + { + "epoch": 6.284275321768327, + "grad_norm": 0.252380907535553, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0111, + "step": 11230 + }, + { + "epoch": 6.289871292669278, + "grad_norm": 0.18762193620204926, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0099, + "step": 11240 + }, + { + "epoch": 6.295467263570229, + "grad_norm": 0.15788224339485168, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0086, + "step": 11250 + }, + { + "epoch": 6.301063234471181, + "grad_norm": 0.22205393016338348, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0081, + "step": 11260 + }, + { + "epoch": 6.306659205372132, + "grad_norm": 0.2042773962020874, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0112, + "step": 11270 + }, + { + "epoch": 6.312255176273084, + "grad_norm": 0.15884517133235931, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.011, + "step": 11280 + }, + { + "epoch": 6.317851147174035, + "grad_norm": 0.1604417860507965, + "learning_rate": 4.348735167125771e-05, + "loss": 0.0126, + "step": 11290 + }, + { + "epoch": 6.323447118074986, + "grad_norm": 0.1566859632730484, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0078, + "step": 11300 + }, + { + "epoch": 6.329043088975937, + "grad_norm": 0.2835988700389862, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0096, + "step": 11310 + }, + { + "epoch": 6.3346390598768885, + "grad_norm": 0.2758636772632599, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0118, + "step": 11320 + }, + { + "epoch": 6.34023503077784, + "grad_norm": 0.09336747974157333, + "learning_rate": 4.315961979031875e-05, + "loss": 0.0111, + "step": 11330 + }, + { + "epoch": 6.345831001678791, + "grad_norm": 0.16241887211799622, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0106, + "step": 11340 + }, + { + "epoch": 6.351426972579743, + "grad_norm": 0.20391559600830078, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0081, + "step": 11350 + }, + { + "epoch": 6.357022943480694, + "grad_norm": 0.12543804943561554, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0137, + "step": 11360 + }, + { + "epoch": 6.362618914381645, + "grad_norm": 0.24983376264572144, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0095, + "step": 11370 + }, + { + "epoch": 6.368214885282597, + "grad_norm": 0.2291889637708664, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0113, + "step": 11380 + }, + { + "epoch": 6.373810856183548, + "grad_norm": 0.1601787656545639, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.0128, + "step": 11390 + }, + { + "epoch": 6.379406827084499, + "grad_norm": 0.14628605544567108, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 6.38500279798545, + "grad_norm": 0.16742217540740967, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0104, + "step": 11410 + }, + { + "epoch": 6.390598768886401, + "grad_norm": 0.20203527808189392, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0112, + "step": 11420 + }, + { + "epoch": 6.396194739787353, + "grad_norm": 0.2605644762516022, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0166, + "step": 11430 + }, + { + "epoch": 6.401790710688305, + "grad_norm": 0.22104188799858093, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0069, + "step": 11440 + }, + { + "epoch": 6.407386681589256, + "grad_norm": 0.2080865204334259, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0136, + "step": 11450 + }, + { + "epoch": 6.412982652490207, + "grad_norm": 0.22961939871311188, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0184, + "step": 11460 + }, + { + "epoch": 6.418578623391158, + "grad_norm": 0.3134923577308655, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0108, + "step": 11470 + }, + { + "epoch": 6.42417459429211, + "grad_norm": 0.11267667263746262, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0113, + "step": 11480 + }, + { + "epoch": 6.429770565193061, + "grad_norm": 0.11718063056468964, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0069, + "step": 11490 + }, + { + "epoch": 6.435366536094012, + "grad_norm": 0.15294240415096283, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0087, + "step": 11500 + }, + { + "epoch": 6.440962506994964, + "grad_norm": 0.16003765165805817, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0087, + "step": 11510 + }, + { + "epoch": 6.446558477895915, + "grad_norm": 0.28345319628715515, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0119, + "step": 11520 + }, + { + "epoch": 6.4521544487968665, + "grad_norm": 0.18619926273822784, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.01, + "step": 11530 + }, + { + "epoch": 6.457750419697818, + "grad_norm": 0.1567833423614502, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0089, + "step": 11540 + }, + { + "epoch": 6.463346390598769, + "grad_norm": 0.16688846051692963, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0078, + "step": 11550 + }, + { + "epoch": 6.46894236149972, + "grad_norm": 0.19638372957706451, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 6.474538332400671, + "grad_norm": 0.13919275999069214, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0105, + "step": 11570 + }, + { + "epoch": 6.4801343033016225, + "grad_norm": 0.17611968517303467, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0145, + "step": 11580 + }, + { + "epoch": 6.485730274202574, + "grad_norm": 0.15707933902740479, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0088, + "step": 11590 + }, + { + "epoch": 6.491326245103526, + "grad_norm": 0.16832014918327332, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0087, + "step": 11600 + }, + { + "epoch": 6.496922216004477, + "grad_norm": 0.16573460400104523, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0085, + "step": 11610 + }, + { + "epoch": 6.502518186905428, + "grad_norm": 0.16878801584243774, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0081, + "step": 11620 + }, + { + "epoch": 6.5081141578063795, + "grad_norm": 0.10650831460952759, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0088, + "step": 11630 + }, + { + "epoch": 6.513710128707331, + "grad_norm": 0.15549488365650177, + "learning_rate": 4.063093427071376e-05, + "loss": 0.008, + "step": 11640 + }, + { + "epoch": 6.519306099608282, + "grad_norm": 0.17358443140983582, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0133, + "step": 11650 + }, + { + "epoch": 6.524902070509233, + "grad_norm": 0.24347983300685883, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0123, + "step": 11660 + }, + { + "epoch": 6.530498041410184, + "grad_norm": 0.31662797927856445, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0108, + "step": 11670 + }, + { + "epoch": 6.5360940123111355, + "grad_norm": 0.21490415930747986, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0116, + "step": 11680 + }, + { + "epoch": 6.541689983212088, + "grad_norm": 0.10896732658147812, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.0088, + "step": 11690 + }, + { + "epoch": 6.547285954113039, + "grad_norm": 0.22287431359291077, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0157, + "step": 11700 + }, + { + "epoch": 6.55288192501399, + "grad_norm": 0.2492447942495346, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0155, + "step": 11710 + }, + { + "epoch": 6.558477895914941, + "grad_norm": 0.09591550379991531, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0076, + "step": 11720 + }, + { + "epoch": 6.564073866815892, + "grad_norm": 0.21364928781986237, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0109, + "step": 11730 + }, + { + "epoch": 6.569669837716844, + "grad_norm": 0.2347889095544815, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 6.575265808617795, + "grad_norm": 0.07959645986557007, + "learning_rate": 3.973923289021829e-05, + "loss": 0.007, + "step": 11750 + }, + { + "epoch": 6.580861779518747, + "grad_norm": 0.18356555700302124, + "learning_rate": 3.965833301517017e-05, + "loss": 0.014, + "step": 11760 + }, + { + "epoch": 6.586457750419698, + "grad_norm": 0.16104575991630554, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.0159, + "step": 11770 + }, + { + "epoch": 6.592053721320649, + "grad_norm": 0.2652454972267151, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0103, + "step": 11780 + }, + { + "epoch": 6.597649692221601, + "grad_norm": 0.29040461778640747, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0077, + "step": 11790 + }, + { + "epoch": 6.603245663122552, + "grad_norm": 0.3047587275505066, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0137, + "step": 11800 + }, + { + "epoch": 6.608841634023503, + "grad_norm": 0.15864235162734985, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0097, + "step": 11810 + }, + { + "epoch": 6.614437604924454, + "grad_norm": 0.20918135344982147, + "learning_rate": 3.917353524881302e-05, + "loss": 0.008, + "step": 11820 + }, + { + "epoch": 6.620033575825405, + "grad_norm": 0.17880207300186157, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0119, + "step": 11830 + }, + { + "epoch": 6.625629546726357, + "grad_norm": 0.16844668984413147, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0092, + "step": 11840 + }, + { + "epoch": 6.631225517627309, + "grad_norm": 0.2069406360387802, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0116, + "step": 11850 + }, + { + "epoch": 6.63682148852826, + "grad_norm": 0.2709522843360901, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0076, + "step": 11860 + }, + { + "epoch": 6.642417459429211, + "grad_norm": 0.16224393248558044, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0099, + "step": 11870 + }, + { + "epoch": 6.648013430330162, + "grad_norm": 0.238708034157753, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 6.6536094012311136, + "grad_norm": 0.12267536669969559, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0086, + "step": 11890 + }, + { + "epoch": 6.659205372132065, + "grad_norm": 0.1931445449590683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0098, + "step": 11900 + }, + { + "epoch": 6.664801343033016, + "grad_norm": 0.23762571811676025, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0091, + "step": 11910 + }, + { + "epoch": 6.670397313933967, + "grad_norm": 0.1977052241563797, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0079, + "step": 11920 + }, + { + "epoch": 6.675993284834918, + "grad_norm": 0.10921810567378998, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0072, + "step": 11930 + }, + { + "epoch": 6.6815892557358705, + "grad_norm": 0.2423611879348755, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0119, + "step": 11940 + }, + { + "epoch": 6.687185226636822, + "grad_norm": 0.19429948925971985, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0081, + "step": 11950 + }, + { + "epoch": 6.692781197537773, + "grad_norm": 0.104859858751297, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0064, + "step": 11960 + }, + { + "epoch": 6.698377168438724, + "grad_norm": 0.11694277077913284, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0086, + "step": 11970 + }, + { + "epoch": 6.703973139339675, + "grad_norm": 0.2368919551372528, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0059, + "step": 11980 + }, + { + "epoch": 6.7095691102406265, + "grad_norm": 0.20411504805088043, + "learning_rate": 3.780588608918947e-05, + "loss": 0.0133, + "step": 11990 + }, + { + "epoch": 6.715165081141578, + "grad_norm": 0.11036452651023865, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0085, + "step": 12000 + }, + { + "epoch": 6.72076105204253, + "grad_norm": 0.09863012284040451, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0064, + "step": 12010 + }, + { + "epoch": 6.726357022943481, + "grad_norm": 0.12064427882432938, + "learning_rate": 3.756550564175727e-05, + "loss": 0.009, + "step": 12020 + }, + { + "epoch": 6.731952993844432, + "grad_norm": 0.11138517409563065, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0082, + "step": 12030 + }, + { + "epoch": 6.7375489647453835, + "grad_norm": 0.1209891140460968, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0074, + "step": 12040 + }, + { + "epoch": 6.743144935646335, + "grad_norm": 0.22739742696285248, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 6.748740906547286, + "grad_norm": 0.19938482344150543, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 6.754336877448237, + "grad_norm": 0.18825367093086243, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0091, + "step": 12070 + }, + { + "epoch": 6.759932848349188, + "grad_norm": 0.18540059030056, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0096, + "step": 12080 + }, + { + "epoch": 6.7655288192501395, + "grad_norm": 0.11188949644565582, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0081, + "step": 12090 + }, + { + "epoch": 6.771124790151092, + "grad_norm": 0.09911153465509415, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 6.776720761052043, + "grad_norm": 0.2001970112323761, + "learning_rate": 3.684622367052887e-05, + "loss": 0.007, + "step": 12110 + }, + { + "epoch": 6.782316731952994, + "grad_norm": 0.256001740694046, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0101, + "step": 12120 + }, + { + "epoch": 6.787912702853945, + "grad_norm": 0.16810284554958344, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 6.793508673754896, + "grad_norm": 0.21629579365253448, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0066, + "step": 12140 + }, + { + "epoch": 6.799104644655848, + "grad_norm": 0.2616669237613678, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0111, + "step": 12150 + }, + { + "epoch": 6.804700615556799, + "grad_norm": 0.18898139894008636, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0103, + "step": 12160 + }, + { + "epoch": 6.81029658645775, + "grad_norm": 0.20177505910396576, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0064, + "step": 12170 + }, + { + "epoch": 6.815892557358701, + "grad_norm": 0.18514911830425262, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 12180 + }, + { + "epoch": 6.821488528259653, + "grad_norm": 0.25894469022750854, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0083, + "step": 12190 + }, + { + "epoch": 6.827084499160605, + "grad_norm": 0.16605038940906525, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0049, + "step": 12200 + }, + { + "epoch": 6.832680470061556, + "grad_norm": 0.17524683475494385, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0066, + "step": 12210 + }, + { + "epoch": 6.838276440962507, + "grad_norm": 0.10738332569599152, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0087, + "step": 12220 + }, + { + "epoch": 6.843872411863458, + "grad_norm": 0.19934684038162231, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 6.849468382764409, + "grad_norm": 0.12639135122299194, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0092, + "step": 12240 + }, + { + "epoch": 6.855064353665361, + "grad_norm": 0.1152096539735794, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.007, + "step": 12250 + }, + { + "epoch": 6.860660324566313, + "grad_norm": 0.2878243625164032, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0096, + "step": 12260 + }, + { + "epoch": 6.866256295467264, + "grad_norm": 0.2725951075553894, + "learning_rate": 3.557479399469721e-05, + "loss": 0.0081, + "step": 12270 + }, + { + "epoch": 6.871852266368215, + "grad_norm": 0.16931770741939545, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0085, + "step": 12280 + }, + { + "epoch": 6.877448237269166, + "grad_norm": 0.11503436416387558, + "learning_rate": 3.541655695359142e-05, + "loss": 0.0062, + "step": 12290 + }, + { + "epoch": 6.8830442081701175, + "grad_norm": 0.18025194108486176, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0082, + "step": 12300 + }, + { + "epoch": 6.888640179071069, + "grad_norm": 0.1392613649368286, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0086, + "step": 12310 + }, + { + "epoch": 6.89423614997202, + "grad_norm": 0.2620909512042999, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0108, + "step": 12320 + }, + { + "epoch": 6.899832120872971, + "grad_norm": 0.12296637147665024, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.008, + "step": 12330 + }, + { + "epoch": 6.905428091773922, + "grad_norm": 0.13329119980335236, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0104, + "step": 12340 + }, + { + "epoch": 6.9110240626748745, + "grad_norm": 0.18710525333881378, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0096, + "step": 12350 + }, + { + "epoch": 6.916620033575826, + "grad_norm": 0.199269637465477, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0098, + "step": 12360 + }, + { + "epoch": 6.922216004476777, + "grad_norm": 0.24953125417232513, + "learning_rate": 3.478522088940993e-05, + "loss": 0.01, + "step": 12370 + }, + { + "epoch": 6.927811975377728, + "grad_norm": 0.1573137789964676, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0119, + "step": 12380 + }, + { + "epoch": 6.933407946278679, + "grad_norm": 0.24244867265224457, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0129, + "step": 12390 + }, + { + "epoch": 6.9390039171796305, + "grad_norm": 0.12841010093688965, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0074, + "step": 12400 + }, + { + "epoch": 6.944599888080582, + "grad_norm": 0.17973212897777557, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0084, + "step": 12410 + }, + { + "epoch": 6.950195858981533, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0096, + "step": 12420 + }, + { + "epoch": 6.955791829882484, + "grad_norm": 0.21580283343791962, + "learning_rate": 3.431345985070067e-05, + "loss": 0.009, + "step": 12430 + }, + { + "epoch": 6.961387800783436, + "grad_norm": 0.22562581300735474, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0105, + "step": 12440 + }, + { + "epoch": 6.966983771684387, + "grad_norm": 0.19070309400558472, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 6.972579742585339, + "grad_norm": 0.2400059998035431, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0102, + "step": 12460 + }, + { + "epoch": 6.97817571348629, + "grad_norm": 0.13252539932727814, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0066, + "step": 12470 + }, + { + "epoch": 6.983771684387241, + "grad_norm": 0.2826622426509857, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0103, + "step": 12480 + }, + { + "epoch": 6.989367655288192, + "grad_norm": 0.2674136757850647, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0089, + "step": 12490 + }, + { + "epoch": 6.9949636261891435, + "grad_norm": 0.09753147512674332, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0126, + "step": 12500 + }, + { + "epoch": 7.000559597090096, + "grad_norm": 0.13642564415931702, + "learning_rate": 3.368685475518488e-05, + "loss": 0.01, + "step": 12510 + }, + { + "epoch": 7.006155567991047, + "grad_norm": 0.2658902704715729, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0168, + "step": 12520 + }, + { + "epoch": 7.011751538891998, + "grad_norm": 0.12951083481311798, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0081, + "step": 12530 + }, + { + "epoch": 7.017347509792949, + "grad_norm": 0.23773689568042755, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0082, + "step": 12540 + }, + { + "epoch": 7.0229434806939, + "grad_norm": 0.21580462157726288, + "learning_rate": 3.337461574737716e-05, + "loss": 0.0106, + "step": 12550 + }, + { + "epoch": 7.028539451594852, + "grad_norm": 0.15399706363677979, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0103, + "step": 12560 + }, + { + "epoch": 7.034135422495803, + "grad_norm": 0.21200086176395416, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0141, + "step": 12570 + }, + { + "epoch": 7.039731393396754, + "grad_norm": 0.2530173063278198, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0101, + "step": 12580 + }, + { + "epoch": 7.045327364297705, + "grad_norm": 0.16888059675693512, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0068, + "step": 12590 + }, + { + "epoch": 7.050923335198657, + "grad_norm": 0.213544562458992, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0099, + "step": 12600 + }, + { + "epoch": 7.0565193060996085, + "grad_norm": 0.2427508383989334, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "epoch": 7.06211527700056, + "grad_norm": 0.3301132023334503, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 7.067711247901511, + "grad_norm": 0.15243375301361084, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0058, + "step": 12630 + }, + { + "epoch": 7.073307218802462, + "grad_norm": 0.14671820402145386, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0062, + "step": 12640 + }, + { + "epoch": 7.078903189703413, + "grad_norm": 0.22168104350566864, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0079, + "step": 12650 + }, + { + "epoch": 7.084499160604365, + "grad_norm": 0.25640955567359924, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0085, + "step": 12660 + }, + { + "epoch": 7.090095131505316, + "grad_norm": 0.2436077892780304, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0078, + "step": 12670 + }, + { + "epoch": 7.095691102406268, + "grad_norm": 0.19463610649108887, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0066, + "step": 12680 + }, + { + "epoch": 7.101287073307219, + "grad_norm": 0.22004422545433044, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0078, + "step": 12690 + }, + { + "epoch": 7.10688304420817, + "grad_norm": 0.1793327033519745, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0059, + "step": 12700 + }, + { + "epoch": 7.1124790151091215, + "grad_norm": 0.2823750376701355, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0072, + "step": 12710 + }, + { + "epoch": 7.118074986010073, + "grad_norm": 0.3006185293197632, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0099, + "step": 12720 + }, + { + "epoch": 7.123670956911024, + "grad_norm": 0.15955254435539246, + "learning_rate": 3.197873634733096e-05, + "loss": 0.01, + "step": 12730 + }, + { + "epoch": 7.129266927811975, + "grad_norm": 0.3392355442047119, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0096, + "step": 12740 + }, + { + "epoch": 7.134862898712926, + "grad_norm": 0.209779292345047, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0083, + "step": 12750 + }, + { + "epoch": 7.140458869613878, + "grad_norm": 0.13492996990680695, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0085, + "step": 12760 + }, + { + "epoch": 7.14605484051483, + "grad_norm": 0.20543181896209717, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0099, + "step": 12770 + }, + { + "epoch": 7.151650811415781, + "grad_norm": 0.24595800042152405, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0089, + "step": 12780 + }, + { + "epoch": 7.157246782316732, + "grad_norm": 0.24962860345840454, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0111, + "step": 12790 + }, + { + "epoch": 7.162842753217683, + "grad_norm": 0.236158549785614, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0092, + "step": 12800 + }, + { + "epoch": 7.1684387241186345, + "grad_norm": 0.09373817592859268, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0073, + "step": 12810 + }, + { + "epoch": 7.174034695019586, + "grad_norm": 0.3018852770328522, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0109, + "step": 12820 + }, + { + "epoch": 7.179630665920537, + "grad_norm": 0.22144253551959991, + "learning_rate": 3.121006642541014e-05, + "loss": 0.008, + "step": 12830 + }, + { + "epoch": 7.185226636821488, + "grad_norm": 0.14473740756511688, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0095, + "step": 12840 + }, + { + "epoch": 7.19082260772244, + "grad_norm": 0.14747409522533417, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0062, + "step": 12850 + }, + { + "epoch": 7.196418578623391, + "grad_norm": 0.2111588716506958, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 12860 + }, + { + "epoch": 7.202014549524343, + "grad_norm": 0.2098371833562851, + "learning_rate": 3.090402508946249e-05, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 7.207610520425294, + "grad_norm": 0.1614372432231903, + "learning_rate": 3.082764475205442e-05, + "loss": 0.007, + "step": 12880 + }, + { + "epoch": 7.213206491326245, + "grad_norm": 0.0742206946015358, + "learning_rate": 3.075131683113352e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 7.218802462227196, + "grad_norm": 0.07135152816772461, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0057, + "step": 12900 + }, + { + "epoch": 7.2243984331281474, + "grad_norm": 0.20988823473453522, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0071, + "step": 12910 + }, + { + "epoch": 7.229994404029099, + "grad_norm": 0.10817866027355194, + "learning_rate": 3.052264965335e-05, + "loss": 0.0049, + "step": 12920 + }, + { + "epoch": 7.235590374930051, + "grad_norm": 0.13764233887195587, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0088, + "step": 12930 + }, + { + "epoch": 7.241186345831002, + "grad_norm": 0.17063380777835846, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0073, + "step": 12940 + }, + { + "epoch": 7.246782316731953, + "grad_norm": 0.11198591440916061, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0112, + "step": 12950 + }, + { + "epoch": 7.252378287632904, + "grad_norm": 0.1855844408273697, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 12960 + }, + { + "epoch": 7.257974258533856, + "grad_norm": 0.10013962537050247, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0063, + "step": 12970 + }, + { + "epoch": 7.263570229434807, + "grad_norm": 0.16480940580368042, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0062, + "step": 12980 + }, + { + "epoch": 7.269166200335758, + "grad_norm": 0.2087039351463318, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0085, + "step": 12990 + }, + { + "epoch": 7.274762171236709, + "grad_norm": 0.15215320885181427, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0077, + "step": 13000 + }, + { + "epoch": 7.280358142137661, + "grad_norm": 0.2687567472457886, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0105, + "step": 13010 + }, + { + "epoch": 7.2859541130386125, + "grad_norm": 0.23126524686813354, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0076, + "step": 13020 + }, + { + "epoch": 7.291550083939564, + "grad_norm": 0.10021013021469116, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.01, + "step": 13030 + }, + { + "epoch": 7.297146054840515, + "grad_norm": 0.16525714099407196, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0082, + "step": 13040 + }, + { + "epoch": 7.302742025741466, + "grad_norm": 0.16742092370986938, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0141, + "step": 13050 + }, + { + "epoch": 7.308337996642417, + "grad_norm": 0.07409677654504776, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0083, + "step": 13060 + }, + { + "epoch": 7.3139339675433686, + "grad_norm": 0.2794577181339264, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0091, + "step": 13070 + }, + { + "epoch": 7.31952993844432, + "grad_norm": 0.16768626868724823, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0128, + "step": 13080 + }, + { + "epoch": 7.325125909345271, + "grad_norm": 0.19670413434505463, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0058, + "step": 13090 + }, + { + "epoch": 7.330721880246223, + "grad_norm": 0.1663038730621338, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0081, + "step": 13100 + }, + { + "epoch": 7.336317851147174, + "grad_norm": 0.2468092292547226, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0094, + "step": 13110 + }, + { + "epoch": 7.3419138220481255, + "grad_norm": 0.20476868748664856, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0095, + "step": 13120 + }, + { + "epoch": 7.347509792949077, + "grad_norm": 0.19373807311058044, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0077, + "step": 13130 + }, + { + "epoch": 7.353105763850028, + "grad_norm": 0.16226400434970856, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0105, + "step": 13140 + }, + { + "epoch": 7.358701734750979, + "grad_norm": 0.2760455906391144, + "learning_rate": 2.878586970034232e-05, + "loss": 0.017, + "step": 13150 + }, + { + "epoch": 7.36429770565193, + "grad_norm": 0.269136518239975, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0127, + "step": 13160 + }, + { + "epoch": 7.3698936765528815, + "grad_norm": 0.2237207144498825, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.007, + "step": 13170 + }, + { + "epoch": 7.375489647453834, + "grad_norm": 0.1836055964231491, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0077, + "step": 13180 + }, + { + "epoch": 7.381085618354785, + "grad_norm": 0.1962578445672989, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0078, + "step": 13190 + }, + { + "epoch": 7.386681589255736, + "grad_norm": 0.16476459801197052, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0093, + "step": 13200 + }, + { + "epoch": 7.392277560156687, + "grad_norm": 0.17988111078739166, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0081, + "step": 13210 + }, + { + "epoch": 7.3978735310576385, + "grad_norm": 0.21751411259174347, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0092, + "step": 13220 + }, + { + "epoch": 7.40346950195859, + "grad_norm": 0.150657057762146, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0094, + "step": 13230 + }, + { + "epoch": 7.409065472859541, + "grad_norm": 0.16722621023654938, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 7.414661443760492, + "grad_norm": 0.16167713701725006, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0063, + "step": 13250 + }, + { + "epoch": 7.420257414661444, + "grad_norm": 0.07585649192333221, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0112, + "step": 13260 + }, + { + "epoch": 7.425853385562395, + "grad_norm": 0.19306915998458862, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0069, + "step": 13270 + }, + { + "epoch": 7.431449356463347, + "grad_norm": 0.23972170054912567, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0062, + "step": 13280 + }, + { + "epoch": 7.437045327364298, + "grad_norm": 0.15592247247695923, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 7.442641298265249, + "grad_norm": 0.20682460069656372, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0052, + "step": 13300 + }, + { + "epoch": 7.4482372691662, + "grad_norm": 0.09251468628644943, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0077, + "step": 13310 + }, + { + "epoch": 7.453833240067151, + "grad_norm": 0.1358599066734314, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0098, + "step": 13320 + }, + { + "epoch": 7.459429210968103, + "grad_norm": 0.10552109777927399, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0045, + "step": 13330 + }, + { + "epoch": 7.465025181869054, + "grad_norm": 0.22331656515598297, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0075, + "step": 13340 + }, + { + "epoch": 7.470621152770006, + "grad_norm": 0.25520750880241394, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 7.476217123670957, + "grad_norm": 0.3160042464733124, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0123, + "step": 13360 + }, + { + "epoch": 7.481813094571908, + "grad_norm": 0.1619534194469452, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 7.48740906547286, + "grad_norm": 0.3031173646450043, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0098, + "step": 13380 + }, + { + "epoch": 7.493005036373811, + "grad_norm": 0.1819227635860443, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0065, + "step": 13390 + }, + { + "epoch": 7.498601007274762, + "grad_norm": 0.17307765781879425, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0087, + "step": 13400 + }, + { + "epoch": 7.504196978175713, + "grad_norm": 0.1600845456123352, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0061, + "step": 13410 + }, + { + "epoch": 7.509792949076665, + "grad_norm": 0.21853172779083252, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0054, + "step": 13420 + }, + { + "epoch": 7.5153889199776165, + "grad_norm": 0.16434265673160553, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0104, + "step": 13430 + }, + { + "epoch": 7.520984890878568, + "grad_norm": 0.1688595563173294, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0068, + "step": 13440 + }, + { + "epoch": 7.526580861779519, + "grad_norm": 0.10968342423439026, + "learning_rate": 2.656698529189193e-05, + "loss": 0.0072, + "step": 13450 + }, + { + "epoch": 7.53217683268047, + "grad_norm": 0.12489527463912964, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0067, + "step": 13460 + }, + { + "epoch": 7.537772803581421, + "grad_norm": 0.3275364935398102, + "learning_rate": 2.642104972349403e-05, + "loss": 0.0066, + "step": 13470 + }, + { + "epoch": 7.5433687744823725, + "grad_norm": 0.10653702169656754, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0133, + "step": 13480 + }, + { + "epoch": 7.548964745383324, + "grad_norm": 0.16446645557880402, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0075, + "step": 13490 + }, + { + "epoch": 7.554560716284275, + "grad_norm": 0.17610448598861694, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0077, + "step": 13500 + }, + { + "epoch": 7.560156687185227, + "grad_norm": 0.1589246541261673, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0044, + "step": 13510 + }, + { + "epoch": 7.565752658086178, + "grad_norm": 0.3019932806491852, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0067, + "step": 13520 + }, + { + "epoch": 7.5713486289871295, + "grad_norm": 0.19549022614955902, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0083, + "step": 13530 + }, + { + "epoch": 7.576944599888081, + "grad_norm": 0.1532057523727417, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0043, + "step": 13540 + }, + { + "epoch": 7.582540570789032, + "grad_norm": 0.1547580510377884, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0083, + "step": 13550 + }, + { + "epoch": 7.588136541689983, + "grad_norm": 0.30122992396354675, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0064, + "step": 13560 + }, + { + "epoch": 7.593732512590934, + "grad_norm": 0.12354984134435654, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0059, + "step": 13570 + }, + { + "epoch": 7.5993284834918855, + "grad_norm": 0.14805443584918976, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 7.604924454392837, + "grad_norm": 0.15644380450248718, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0076, + "step": 13590 + }, + { + "epoch": 7.610520425293789, + "grad_norm": 0.22504927217960358, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0114, + "step": 13600 + }, + { + "epoch": 7.61611639619474, + "grad_norm": 0.10872774571180344, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0098, + "step": 13610 + }, + { + "epoch": 7.621712367095691, + "grad_norm": 0.1415761411190033, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0059, + "step": 13620 + }, + { + "epoch": 7.627308337996642, + "grad_norm": 0.18380744755268097, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0074, + "step": 13630 + }, + { + "epoch": 7.632904308897594, + "grad_norm": 0.13029605150222778, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0046, + "step": 13640 + }, + { + "epoch": 7.638500279798545, + "grad_norm": 0.11022605746984482, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0052, + "step": 13650 + }, + { + "epoch": 7.644096250699496, + "grad_norm": 0.1933964192867279, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0062, + "step": 13660 + }, + { + "epoch": 7.649692221600448, + "grad_norm": 0.10140606015920639, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.005, + "step": 13670 + }, + { + "epoch": 7.655288192501399, + "grad_norm": 0.1074545681476593, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "epoch": 7.660884163402351, + "grad_norm": 0.11866219341754913, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0063, + "step": 13690 + }, + { + "epoch": 7.666480134303302, + "grad_norm": 0.14528554677963257, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0075, + "step": 13700 + }, + { + "epoch": 7.672076105204253, + "grad_norm": 0.12533891201019287, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0056, + "step": 13710 + }, + { + "epoch": 7.677672076105204, + "grad_norm": 0.2228127419948578, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.007, + "step": 13720 + }, + { + "epoch": 7.683268047006155, + "grad_norm": 0.167043074965477, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0092, + "step": 13730 + }, + { + "epoch": 7.688864017907107, + "grad_norm": 0.1956396847963333, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0104, + "step": 13740 + }, + { + "epoch": 7.694459988808058, + "grad_norm": 0.3440028429031372, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0101, + "step": 13750 + }, + { + "epoch": 7.70005595970901, + "grad_norm": 0.19462288916110992, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0123, + "step": 13760 + }, + { + "epoch": 7.705651930609961, + "grad_norm": 0.2774219512939453, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.012, + "step": 13770 + }, + { + "epoch": 7.711247901510912, + "grad_norm": 0.15917648375034332, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0072, + "step": 13780 + }, + { + "epoch": 7.7168438724118635, + "grad_norm": 0.17087779939174652, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0051, + "step": 13790 + }, + { + "epoch": 7.722439843312815, + "grad_norm": 0.3049764931201935, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0068, + "step": 13800 + }, + { + "epoch": 7.728035814213766, + "grad_norm": 0.23013077676296234, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0093, + "step": 13810 + }, + { + "epoch": 7.733631785114717, + "grad_norm": 0.25196191668510437, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0063, + "step": 13820 + }, + { + "epoch": 7.739227756015668, + "grad_norm": 0.13192011415958405, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0077, + "step": 13830 + }, + { + "epoch": 7.74482372691662, + "grad_norm": 0.14442972838878632, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 7.750419697817572, + "grad_norm": 0.1425463706254959, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0066, + "step": 13850 + }, + { + "epoch": 7.756015668718523, + "grad_norm": 0.1162482276558876, + "learning_rate": 2.362879302963135e-05, + "loss": 0.007, + "step": 13860 + }, + { + "epoch": 7.761611639619474, + "grad_norm": 0.21869398653507233, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 7.767207610520425, + "grad_norm": 0.1804109364748001, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0064, + "step": 13880 + }, + { + "epoch": 7.7728035814213765, + "grad_norm": 0.18711616098880768, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0082, + "step": 13890 + }, + { + "epoch": 7.778399552322328, + "grad_norm": 0.17115071415901184, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.008, + "step": 13900 + }, + { + "epoch": 7.783995523223279, + "grad_norm": 0.3199642300605774, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.008, + "step": 13910 + }, + { + "epoch": 7.789591494124231, + "grad_norm": 0.16800075769424438, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0054, + "step": 13920 + }, + { + "epoch": 7.795187465025182, + "grad_norm": 0.11445470154285431, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0067, + "step": 13930 + }, + { + "epoch": 7.800783435926133, + "grad_norm": 0.1465342938899994, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0046, + "step": 13940 + }, + { + "epoch": 7.806379406827085, + "grad_norm": 0.10726216435432434, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.013, + "step": 13950 + }, + { + "epoch": 7.811975377728036, + "grad_norm": 0.2467944324016571, + "learning_rate": 2.29300341084631e-05, + "loss": 0.006, + "step": 13960 + }, + { + "epoch": 7.817571348628987, + "grad_norm": 0.18158167600631714, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0065, + "step": 13970 + }, + { + "epoch": 7.823167319529938, + "grad_norm": 0.1618615835905075, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0065, + "step": 13980 + }, + { + "epoch": 7.8287632904308895, + "grad_norm": 0.1069146990776062, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0105, + "step": 13990 + }, + { + "epoch": 7.834359261331841, + "grad_norm": 0.12003065645694733, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 7.839955232232793, + "grad_norm": 0.09423186630010605, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0061, + "step": 14010 + }, + { + "epoch": 7.845551203133744, + "grad_norm": 0.35245028138160706, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0091, + "step": 14020 + }, + { + "epoch": 7.851147174034695, + "grad_norm": 0.11108125001192093, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.007, + "step": 14030 + }, + { + "epoch": 7.856743144935646, + "grad_norm": 0.10527674853801727, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0042, + "step": 14040 + }, + { + "epoch": 7.862339115836598, + "grad_norm": 0.2263229489326477, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0056, + "step": 14050 + }, + { + "epoch": 7.867935086737549, + "grad_norm": 0.13221915066242218, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "epoch": 7.8735310576385, + "grad_norm": 0.17508424818515778, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.007, + "step": 14070 + }, + { + "epoch": 7.879127028539451, + "grad_norm": 0.24999241530895233, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0085, + "step": 14080 + }, + { + "epoch": 7.8847229994404024, + "grad_norm": 0.12810635566711426, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0084, + "step": 14090 + }, + { + "epoch": 7.8903189703413545, + "grad_norm": 0.22745615243911743, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0145, + "step": 14100 + }, + { + "epoch": 7.895914941242306, + "grad_norm": 0.0886574536561966, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0071, + "step": 14110 + }, + { + "epoch": 7.901510912143257, + "grad_norm": 0.18623238801956177, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0077, + "step": 14120 + }, + { + "epoch": 7.907106883044208, + "grad_norm": 0.20176784694194794, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.008, + "step": 14130 + }, + { + "epoch": 7.912702853945159, + "grad_norm": 0.18935443460941315, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0069, + "step": 14140 + }, + { + "epoch": 7.918298824846111, + "grad_norm": 0.18812550604343414, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.0051, + "step": 14150 + }, + { + "epoch": 7.923894795747062, + "grad_norm": 0.12224578857421875, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0053, + "step": 14160 + }, + { + "epoch": 7.929490766648014, + "grad_norm": 0.12175440043210983, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0075, + "step": 14170 + }, + { + "epoch": 7.935086737548965, + "grad_norm": 0.11878049373626709, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0062, + "step": 14180 + }, + { + "epoch": 7.940682708449916, + "grad_norm": 0.19284716248512268, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0063, + "step": 14190 + }, + { + "epoch": 7.9462786793508675, + "grad_norm": 0.159319207072258, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0064, + "step": 14200 + }, + { + "epoch": 7.951874650251819, + "grad_norm": 0.16800148785114288, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0054, + "step": 14210 + }, + { + "epoch": 7.95747062115277, + "grad_norm": 0.23277972638607025, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0056, + "step": 14220 + }, + { + "epoch": 7.963066592053721, + "grad_norm": 0.08594591915607452, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0054, + "step": 14230 + }, + { + "epoch": 7.968662562954672, + "grad_norm": 0.21676327288150787, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0071, + "step": 14240 + }, + { + "epoch": 7.9742585338556236, + "grad_norm": 0.1778232604265213, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0067, + "step": 14250 + }, + { + "epoch": 7.979854504756576, + "grad_norm": 0.2177736759185791, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0068, + "step": 14260 + }, + { + "epoch": 7.985450475657527, + "grad_norm": 0.25127291679382324, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0104, + "step": 14270 + }, + { + "epoch": 7.991046446558478, + "grad_norm": 0.08792544901371002, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0064, + "step": 14280 + }, + { + "epoch": 7.996642417459429, + "grad_norm": 0.1125119999051094, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0099, + "step": 14290 + }, + { + "epoch": 8.00223838836038, + "grad_norm": 0.18561410903930664, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0089, + "step": 14300 + }, + { + "epoch": 8.007834359261333, + "grad_norm": 0.10987678915262222, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0071, + "step": 14310 + }, + { + "epoch": 8.013430330162283, + "grad_norm": 0.10636857897043228, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 8.019026301063235, + "grad_norm": 0.16379332542419434, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.006, + "step": 14330 + }, + { + "epoch": 8.024622271964185, + "grad_norm": 0.09951362758874893, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0052, + "step": 14340 + }, + { + "epoch": 8.030218242865137, + "grad_norm": 0.1974375694990158, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0061, + "step": 14350 + }, + { + "epoch": 8.035814213766088, + "grad_norm": 0.16213521361351013, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0056, + "step": 14360 + }, + { + "epoch": 8.04141018466704, + "grad_norm": 0.32907333970069885, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.011, + "step": 14370 + }, + { + "epoch": 8.047006155567992, + "grad_norm": 0.24763990938663483, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.0059, + "step": 14380 + }, + { + "epoch": 8.052602126468942, + "grad_norm": 0.06522991508245468, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0075, + "step": 14390 + }, + { + "epoch": 8.058198097369894, + "grad_norm": 0.1594466120004654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.0107, + "step": 14400 + }, + { + "epoch": 8.063794068270845, + "grad_norm": 0.11248297244310379, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0043, + "step": 14410 + }, + { + "epoch": 8.069390039171797, + "grad_norm": 0.11491246521472931, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0043, + "step": 14420 + }, + { + "epoch": 8.074986010072747, + "grad_norm": 0.22106263041496277, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0055, + "step": 14430 + }, + { + "epoch": 8.0805819809737, + "grad_norm": 0.16226910054683685, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.0085, + "step": 14440 + }, + { + "epoch": 8.08617795187465, + "grad_norm": 0.09066546708345413, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0083, + "step": 14450 + }, + { + "epoch": 8.091773922775602, + "grad_norm": 0.11933751404285431, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 8.097369893676554, + "grad_norm": 0.1404096931219101, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0047, + "step": 14470 + }, + { + "epoch": 8.102965864577504, + "grad_norm": 0.2508150339126587, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0076, + "step": 14480 + }, + { + "epoch": 8.108561835478456, + "grad_norm": 0.17527352273464203, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0077, + "step": 14490 + }, + { + "epoch": 8.114157806379406, + "grad_norm": 0.11819542944431305, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0048, + "step": 14500 + }, + { + "epoch": 8.119753777280359, + "grad_norm": 0.17159508168697357, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0072, + "step": 14510 + }, + { + "epoch": 8.125349748181309, + "grad_norm": 0.1392519325017929, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 8.130945719082261, + "grad_norm": 0.2096053957939148, + "learning_rate": 1.909401999301993e-05, + "loss": 0.007, + "step": 14530 + }, + { + "epoch": 8.136541689983211, + "grad_norm": 0.2075774371623993, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0063, + "step": 14540 + }, + { + "epoch": 8.142137660884163, + "grad_norm": 0.0607825368642807, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0046, + "step": 14550 + }, + { + "epoch": 8.147733631785115, + "grad_norm": 0.20028991997241974, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0067, + "step": 14560 + }, + { + "epoch": 8.153329602686066, + "grad_norm": 0.12437421083450317, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 8.158925573587018, + "grad_norm": 0.09412521868944168, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0053, + "step": 14580 + }, + { + "epoch": 8.164521544487968, + "grad_norm": 0.30078214406967163, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0088, + "step": 14590 + }, + { + "epoch": 8.17011751538892, + "grad_norm": 0.2020367681980133, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0074, + "step": 14600 + }, + { + "epoch": 8.17571348628987, + "grad_norm": 0.12557435035705566, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 8.181309457190823, + "grad_norm": 0.11945895105600357, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0078, + "step": 14620 + }, + { + "epoch": 8.186905428091775, + "grad_norm": 0.33773839473724365, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0101, + "step": 14630 + }, + { + "epoch": 8.192501398992725, + "grad_norm": 0.2184380739927292, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0059, + "step": 14640 + }, + { + "epoch": 8.198097369893677, + "grad_norm": 0.06359529495239258, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.006, + "step": 14650 + }, + { + "epoch": 8.203693340794628, + "grad_norm": 0.1690957248210907, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.008, + "step": 14660 + }, + { + "epoch": 8.20928931169558, + "grad_norm": 0.296812504529953, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.0069, + "step": 14670 + }, + { + "epoch": 8.21488528259653, + "grad_norm": 0.08360179513692856, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 8.220481253497482, + "grad_norm": 0.12985661625862122, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0079, + "step": 14690 + }, + { + "epoch": 8.226077224398432, + "grad_norm": 0.10445982962846756, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0036, + "step": 14700 + }, + { + "epoch": 8.231673195299384, + "grad_norm": 0.1876983791589737, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0058, + "step": 14710 + }, + { + "epoch": 8.237269166200337, + "grad_norm": 0.07933235913515091, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0048, + "step": 14720 + }, + { + "epoch": 8.242865137101287, + "grad_norm": 0.2597578465938568, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0077, + "step": 14730 + }, + { + "epoch": 8.248461108002239, + "grad_norm": 0.1318414807319641, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0038, + "step": 14740 + }, + { + "epoch": 8.25405707890319, + "grad_norm": 0.23657891154289246, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0074, + "step": 14750 + }, + { + "epoch": 8.259653049804141, + "grad_norm": 0.12084835767745972, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0086, + "step": 14760 + }, + { + "epoch": 8.265249020705092, + "grad_norm": 0.2077346295118332, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0066, + "step": 14770 + }, + { + "epoch": 8.270844991606044, + "grad_norm": 0.26253417134284973, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0107, + "step": 14780 + }, + { + "epoch": 8.276440962506994, + "grad_norm": 0.25481829047203064, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "epoch": 8.282036933407946, + "grad_norm": 0.23157408833503723, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0069, + "step": 14800 + }, + { + "epoch": 8.287632904308898, + "grad_norm": 0.10085418075323105, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0036, + "step": 14810 + }, + { + "epoch": 8.293228875209849, + "grad_norm": 0.10876370966434479, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.007, + "step": 14820 + }, + { + "epoch": 8.2988248461108, + "grad_norm": 0.20935757458209991, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0055, + "step": 14830 + }, + { + "epoch": 8.304420817011751, + "grad_norm": 0.13824748992919922, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0072, + "step": 14840 + }, + { + "epoch": 8.310016787912703, + "grad_norm": 0.3369564414024353, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0069, + "step": 14850 + }, + { + "epoch": 8.315612758813653, + "grad_norm": 0.2684394419193268, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0061, + "step": 14860 + }, + { + "epoch": 8.321208729714606, + "grad_norm": 0.23020261526107788, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0126, + "step": 14870 + }, + { + "epoch": 8.326804700615558, + "grad_norm": 0.23905567824840546, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0057, + "step": 14880 + }, + { + "epoch": 8.332400671516508, + "grad_norm": 0.11183072626590729, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0061, + "step": 14890 + }, + { + "epoch": 8.33799664241746, + "grad_norm": 0.11450804024934769, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0041, + "step": 14900 + }, + { + "epoch": 8.34359261331841, + "grad_norm": 0.14276103675365448, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0046, + "step": 14910 + }, + { + "epoch": 8.349188584219362, + "grad_norm": 0.1129729300737381, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0063, + "step": 14920 + }, + { + "epoch": 8.354784555120313, + "grad_norm": 0.18285100162029266, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "epoch": 8.360380526021265, + "grad_norm": 0.10539596527814865, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0043, + "step": 14940 + }, + { + "epoch": 8.365976496922215, + "grad_norm": 0.13819168508052826, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0065, + "step": 14950 + }, + { + "epoch": 8.371572467823167, + "grad_norm": 0.19076746702194214, + "learning_rate": 1.637995976326527e-05, + "loss": 0.01, + "step": 14960 + }, + { + "epoch": 8.37716843872412, + "grad_norm": 0.24138867855072021, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0082, + "step": 14970 + }, + { + "epoch": 8.38276440962507, + "grad_norm": 0.1397552490234375, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0047, + "step": 14980 + }, + { + "epoch": 8.388360380526022, + "grad_norm": 0.08066073060035706, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0076, + "step": 14990 + }, + { + "epoch": 8.393956351426972, + "grad_norm": 0.2772653102874756, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.008, + "step": 15000 + }, + { + "epoch": 8.399552322327924, + "grad_norm": 0.1933654099702835, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 8.405148293228875, + "grad_norm": 0.09738892316818237, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0046, + "step": 15020 + }, + { + "epoch": 8.410744264129827, + "grad_norm": 0.11632133275270462, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0079, + "step": 15030 + }, + { + "epoch": 8.416340235030777, + "grad_norm": 0.10637476295232773, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0052, + "step": 15040 + }, + { + "epoch": 8.421936205931729, + "grad_norm": 0.22587163746356964, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0064, + "step": 15050 + }, + { + "epoch": 8.427532176832681, + "grad_norm": 0.15165762603282928, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0068, + "step": 15060 + }, + { + "epoch": 8.433128147733632, + "grad_norm": 0.13497453927993774, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0076, + "step": 15070 + }, + { + "epoch": 8.438724118634584, + "grad_norm": 0.1444980800151825, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0066, + "step": 15080 + }, + { + "epoch": 8.444320089535534, + "grad_norm": 0.32101383805274963, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0054, + "step": 15090 + }, + { + "epoch": 8.449916060436486, + "grad_norm": 0.26894599199295044, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 8.455512031337436, + "grad_norm": 0.26109951734542847, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.0057, + "step": 15110 + }, + { + "epoch": 8.461108002238388, + "grad_norm": 0.09691357612609863, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0036, + "step": 15120 + }, + { + "epoch": 8.46670397313934, + "grad_norm": 0.08756586909294128, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0059, + "step": 15130 + }, + { + "epoch": 8.47229994404029, + "grad_norm": 0.0936000794172287, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0042, + "step": 15140 + }, + { + "epoch": 8.477895914941243, + "grad_norm": 0.20747262239456177, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0118, + "step": 15150 + }, + { + "epoch": 8.483491885842193, + "grad_norm": 0.11843043565750122, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0072, + "step": 15160 + }, + { + "epoch": 8.489087856743145, + "grad_norm": 0.23854964971542358, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0066, + "step": 15170 + }, + { + "epoch": 8.494683827644096, + "grad_norm": 0.1770446002483368, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0062, + "step": 15180 + }, + { + "epoch": 8.500279798545048, + "grad_norm": 0.23799461126327515, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.0075, + "step": 15190 + }, + { + "epoch": 8.505875769445998, + "grad_norm": 0.3780512511730194, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0081, + "step": 15200 + }, + { + "epoch": 8.51147174034695, + "grad_norm": 0.11119966208934784, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0046, + "step": 15210 + }, + { + "epoch": 8.517067711247902, + "grad_norm": 0.09658356010913849, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.007, + "step": 15220 + }, + { + "epoch": 8.522663682148853, + "grad_norm": 0.17144611477851868, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 8.528259653049805, + "grad_norm": 0.14623138308525085, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0051, + "step": 15240 + }, + { + "epoch": 8.533855623950755, + "grad_norm": 0.1579722911119461, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0049, + "step": 15250 + }, + { + "epoch": 8.539451594851707, + "grad_norm": 0.14990709722042084, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 8.545047565752657, + "grad_norm": 0.24695487320423126, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.0072, + "step": 15270 + }, + { + "epoch": 8.55064353665361, + "grad_norm": 0.2550105154514313, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0058, + "step": 15280 + }, + { + "epoch": 8.556239507554562, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0089, + "step": 15290 + }, + { + "epoch": 8.561835478455512, + "grad_norm": 0.3639971613883972, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0085, + "step": 15300 + }, + { + "epoch": 8.567431449356464, + "grad_norm": 0.12687824666500092, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0061, + "step": 15310 + }, + { + "epoch": 8.573027420257414, + "grad_norm": 0.1352899670600891, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0066, + "step": 15320 + }, + { + "epoch": 8.578623391158366, + "grad_norm": 0.2139214277267456, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.006, + "step": 15330 + }, + { + "epoch": 8.584219362059317, + "grad_norm": 0.13526403903961182, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0061, + "step": 15340 + }, + { + "epoch": 8.589815332960269, + "grad_norm": 0.10206010937690735, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 8.59541130386122, + "grad_norm": 0.1680195927619934, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0063, + "step": 15360 + }, + { + "epoch": 8.601007274762171, + "grad_norm": 0.2092961072921753, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0089, + "step": 15370 + }, + { + "epoch": 8.606603245663123, + "grad_norm": 0.24639266729354858, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 8.612199216564074, + "grad_norm": 0.20889121294021606, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0036, + "step": 15390 + }, + { + "epoch": 8.617795187465026, + "grad_norm": 0.1731806993484497, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0049, + "step": 15400 + }, + { + "epoch": 8.623391158365976, + "grad_norm": 0.15791241824626923, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0064, + "step": 15410 + }, + { + "epoch": 8.628987129266928, + "grad_norm": 0.2612980604171753, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 8.634583100167879, + "grad_norm": 0.12942969799041748, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0044, + "step": 15430 + }, + { + "epoch": 8.64017907106883, + "grad_norm": 0.224086731672287, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0065, + "step": 15440 + }, + { + "epoch": 8.645775041969781, + "grad_norm": 0.234924778342247, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0055, + "step": 15450 + }, + { + "epoch": 8.651371012870733, + "grad_norm": 0.14701171219348907, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0078, + "step": 15460 + }, + { + "epoch": 8.656966983771685, + "grad_norm": 0.06672263145446777, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 8.662562954672635, + "grad_norm": 0.11234284192323685, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.0053, + "step": 15480 + }, + { + "epoch": 8.668158925573588, + "grad_norm": 0.2150266021490097, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.0045, + "step": 15490 + }, + { + "epoch": 8.673754896474538, + "grad_norm": 0.37493982911109924, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 8.67935086737549, + "grad_norm": 0.054848652333021164, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0062, + "step": 15510 + }, + { + "epoch": 8.68494683827644, + "grad_norm": 0.30526259541511536, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.005, + "step": 15520 + }, + { + "epoch": 8.690542809177392, + "grad_norm": 0.11414709687232971, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0055, + "step": 15530 + }, + { + "epoch": 8.696138780078343, + "grad_norm": 0.19409357011318207, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0048, + "step": 15540 + }, + { + "epoch": 8.701734750979295, + "grad_norm": 0.0840323343873024, + "learning_rate": 1.293509864018146e-05, + "loss": 0.0062, + "step": 15550 + }, + { + "epoch": 8.707330721880247, + "grad_norm": 0.2921426594257355, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0055, + "step": 15560 + }, + { + "epoch": 8.712926692781197, + "grad_norm": 0.18921242654323578, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0065, + "step": 15570 + }, + { + "epoch": 8.71852266368215, + "grad_norm": 0.2043517678976059, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0048, + "step": 15580 + }, + { + "epoch": 8.7241186345831, + "grad_norm": 0.14090007543563843, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0043, + "step": 15590 + }, + { + "epoch": 8.729714605484052, + "grad_norm": 0.13512486219406128, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.009, + "step": 15600 + }, + { + "epoch": 8.735310576385002, + "grad_norm": 0.16850633919239044, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0043, + "step": 15610 + }, + { + "epoch": 8.740906547285954, + "grad_norm": 0.0671689510345459, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 8.746502518186904, + "grad_norm": 0.17265447974205017, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0078, + "step": 15630 + }, + { + "epoch": 8.752098489087857, + "grad_norm": 0.09633443504571915, + "learning_rate": 1.243982485229559e-05, + "loss": 0.01, + "step": 15640 + }, + { + "epoch": 8.757694459988809, + "grad_norm": 0.07608158886432648, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.008, + "step": 15650 + }, + { + "epoch": 8.763290430889759, + "grad_norm": 0.1386493295431137, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0141, + "step": 15660 + }, + { + "epoch": 8.768886401790711, + "grad_norm": 0.22368523478507996, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 8.774482372691661, + "grad_norm": 0.1423027664422989, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0059, + "step": 15680 + }, + { + "epoch": 8.780078343592614, + "grad_norm": 0.09472924470901489, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0078, + "step": 15690 + }, + { + "epoch": 8.785674314493564, + "grad_norm": 0.1385987550020218, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0044, + "step": 15700 + }, + { + "epoch": 8.791270285394516, + "grad_norm": 0.1465826779603958, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0035, + "step": 15710 + }, + { + "epoch": 8.796866256295468, + "grad_norm": 0.3275586664676666, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0057, + "step": 15720 + }, + { + "epoch": 8.802462227196418, + "grad_norm": 0.09989197552204132, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.006, + "step": 15730 + }, + { + "epoch": 8.80805819809737, + "grad_norm": 0.12719599902629852, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0042, + "step": 15740 + }, + { + "epoch": 8.81365416899832, + "grad_norm": 0.3480566740036011, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.0073, + "step": 15750 + }, + { + "epoch": 8.819250139899273, + "grad_norm": 0.1562948226928711, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0066, + "step": 15760 + }, + { + "epoch": 8.824846110800223, + "grad_norm": 0.23169738054275513, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0067, + "step": 15770 + }, + { + "epoch": 8.830442081701175, + "grad_norm": 0.1328081339597702, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0051, + "step": 15780 + }, + { + "epoch": 8.836038052602127, + "grad_norm": 0.10535513609647751, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0061, + "step": 15790 + }, + { + "epoch": 8.841634023503078, + "grad_norm": 0.08534829318523407, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 8.84722999440403, + "grad_norm": 0.21224470436573029, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0066, + "step": 15810 + }, + { + "epoch": 8.85282596530498, + "grad_norm": 0.20451109111309052, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0067, + "step": 15820 + }, + { + "epoch": 8.858421936205932, + "grad_norm": 0.21763543784618378, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0089, + "step": 15830 + }, + { + "epoch": 8.864017907106883, + "grad_norm": 0.14212079346179962, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0048, + "step": 15840 + }, + { + "epoch": 8.869613878007835, + "grad_norm": 0.1489504873752594, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0046, + "step": 15850 + }, + { + "epoch": 8.875209848908785, + "grad_norm": 0.09450363367795944, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 8.880805819809737, + "grad_norm": 0.2034289836883545, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0081, + "step": 15870 + }, + { + "epoch": 8.88640179071069, + "grad_norm": 0.13935258984565735, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0064, + "step": 15880 + }, + { + "epoch": 8.89199776161164, + "grad_norm": 0.08578619360923767, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0111, + "step": 15890 + }, + { + "epoch": 8.897593732512592, + "grad_norm": 0.0884697362780571, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0108, + "step": 15900 + }, + { + "epoch": 8.903189703413542, + "grad_norm": 0.28323593735694885, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0116, + "step": 15910 + }, + { + "epoch": 8.908785674314494, + "grad_norm": 0.14971330761909485, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0079, + "step": 15920 + }, + { + "epoch": 8.914381645215444, + "grad_norm": 0.18808728456497192, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0082, + "step": 15930 + }, + { + "epoch": 8.919977616116396, + "grad_norm": 0.16924549639225006, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0064, + "step": 15940 + }, + { + "epoch": 8.925573587017347, + "grad_norm": 0.1466728150844574, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0106, + "step": 15950 + }, + { + "epoch": 8.931169557918299, + "grad_norm": 0.18614622950553894, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.0089, + "step": 15960 + }, + { + "epoch": 8.936765528819251, + "grad_norm": 0.04307910427451134, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0092, + "step": 15970 + }, + { + "epoch": 8.942361499720201, + "grad_norm": 0.20207299292087555, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0077, + "step": 15980 + }, + { + "epoch": 8.947957470621153, + "grad_norm": 0.12527382373809814, + "learning_rate": 1.059380755314613e-05, + "loss": 0.008, + "step": 15990 + }, + { + "epoch": 8.953553441522104, + "grad_norm": 0.3143978416919708, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0061, + "step": 16000 + }, + { + "epoch": 8.959149412423056, + "grad_norm": 0.0894945040345192, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0041, + "step": 16010 + }, + { + "epoch": 8.964745383324006, + "grad_norm": 0.23625624179840088, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0067, + "step": 16020 + }, + { + "epoch": 8.970341354224958, + "grad_norm": 0.14668506383895874, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 8.975937325125908, + "grad_norm": 0.17659400403499603, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0066, + "step": 16040 + }, + { + "epoch": 8.98153329602686, + "grad_norm": 0.2076718956232071, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0074, + "step": 16050 + }, + { + "epoch": 8.987129266927813, + "grad_norm": 0.1386403888463974, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 8.992725237828763, + "grad_norm": 0.12247960269451141, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0059, + "step": 16070 + }, + { + "epoch": 8.998321208729715, + "grad_norm": 0.08335962146520615, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0045, + "step": 16080 + }, + { + "epoch": 9.003917179630665, + "grad_norm": 0.13206073641777039, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0049, + "step": 16090 + }, + { + "epoch": 9.009513150531617, + "grad_norm": 0.1473735272884369, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0035, + "step": 16100 + }, + { + "epoch": 9.015109121432568, + "grad_norm": 0.07891960442066193, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 9.02070509233352, + "grad_norm": 0.16706879436969757, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0039, + "step": 16120 + }, + { + "epoch": 9.026301063234472, + "grad_norm": 0.0876656025648117, + "learning_rate": 9.892011949106172e-06, + "loss": 0.008, + "step": 16130 + }, + { + "epoch": 9.031897034135422, + "grad_norm": 0.10205890983343124, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0071, + "step": 16140 + }, + { + "epoch": 9.037493005036374, + "grad_norm": 0.16774903237819672, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0037, + "step": 16150 + }, + { + "epoch": 9.043088975937325, + "grad_norm": 0.2676295340061188, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0062, + "step": 16160 + }, + { + "epoch": 9.048684946838277, + "grad_norm": 0.2096317857503891, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0036, + "step": 16170 + }, + { + "epoch": 9.054280917739227, + "grad_norm": 0.09436144679784775, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0045, + "step": 16180 + }, + { + "epoch": 9.05987688864018, + "grad_norm": 0.17315761744976044, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0064, + "step": 16190 + }, + { + "epoch": 9.06547285954113, + "grad_norm": 0.07326121628284454, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0035, + "step": 16200 + }, + { + "epoch": 9.071068830442082, + "grad_norm": 0.14720216393470764, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0055, + "step": 16210 + }, + { + "epoch": 9.076664801343034, + "grad_norm": 0.0691135823726654, + "learning_rate": 9.452183123004e-06, + "loss": 0.0077, + "step": 16220 + }, + { + "epoch": 9.082260772243984, + "grad_norm": 0.13588427007198334, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0046, + "step": 16230 + }, + { + "epoch": 9.087856743144936, + "grad_norm": 0.13852879405021667, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 9.093452714045887, + "grad_norm": 0.0626252144575119, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0069, + "step": 16250 + }, + { + "epoch": 9.099048684946839, + "grad_norm": 0.25919991731643677, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0046, + "step": 16260 + }, + { + "epoch": 9.104644655847789, + "grad_norm": 0.0894588977098465, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0046, + "step": 16270 + }, + { + "epoch": 9.110240626748741, + "grad_norm": 0.45610806345939636, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0069, + "step": 16280 + }, + { + "epoch": 9.115836597649691, + "grad_norm": 0.1714649349451065, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0058, + "step": 16290 + }, + { + "epoch": 9.121432568550643, + "grad_norm": 0.20788055658340454, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0046, + "step": 16300 + }, + { + "epoch": 9.127028539451596, + "grad_norm": 0.13281454145908356, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0073, + "step": 16310 + }, + { + "epoch": 9.132624510352546, + "grad_norm": 0.20327745378017426, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0061, + "step": 16320 + }, + { + "epoch": 9.138220481253498, + "grad_norm": 0.1418776661157608, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0054, + "step": 16330 + }, + { + "epoch": 9.143816452154448, + "grad_norm": 0.08617481589317322, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0057, + "step": 16340 + }, + { + "epoch": 9.1494124230554, + "grad_norm": 0.14362642168998718, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0059, + "step": 16350 + }, + { + "epoch": 9.15500839395635, + "grad_norm": 0.05195459723472595, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0063, + "step": 16360 + }, + { + "epoch": 9.160604364857303, + "grad_norm": 0.1765775829553604, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0075, + "step": 16370 + }, + { + "epoch": 9.166200335758255, + "grad_norm": 0.1646648496389389, + "learning_rate": 8.692497190280224e-06, + "loss": 0.0065, + "step": 16380 + }, + { + "epoch": 9.171796306659205, + "grad_norm": 0.16203129291534424, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0049, + "step": 16390 + }, + { + "epoch": 9.177392277560157, + "grad_norm": 0.07584717124700546, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 9.182988248461108, + "grad_norm": 0.14030073583126068, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0033, + "step": 16410 + }, + { + "epoch": 9.18858421936206, + "grad_norm": 0.09595508873462677, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0034, + "step": 16420 + }, + { + "epoch": 9.19418019026301, + "grad_norm": 0.2346934825181961, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0091, + "step": 16430 + }, + { + "epoch": 9.199776161163962, + "grad_norm": 0.11720654368400574, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0056, + "step": 16440 + }, + { + "epoch": 9.205372132064912, + "grad_norm": 0.18266266584396362, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0044, + "step": 16450 + }, + { + "epoch": 9.210968102965865, + "grad_norm": 0.11530566215515137, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0091, + "step": 16460 + }, + { + "epoch": 9.216564073866817, + "grad_norm": 0.15264108777046204, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0056, + "step": 16470 + }, + { + "epoch": 9.222160044767767, + "grad_norm": 0.12221037596464157, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0046, + "step": 16480 + }, + { + "epoch": 9.227756015668719, + "grad_norm": 0.18118728697299957, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0048, + "step": 16490 + }, + { + "epoch": 9.23335198656967, + "grad_norm": 0.11156457662582397, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0038, + "step": 16500 + }, + { + "epoch": 9.238947957470621, + "grad_norm": 0.24215921759605408, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0053, + "step": 16510 + }, + { + "epoch": 9.244543928371572, + "grad_norm": 0.1042838767170906, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0038, + "step": 16520 + }, + { + "epoch": 9.250139899272524, + "grad_norm": 0.15312840044498444, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0056, + "step": 16530 + }, + { + "epoch": 9.255735870173474, + "grad_norm": 0.12216275930404663, + "learning_rate": 7.96172046062032e-06, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 9.261331841074426, + "grad_norm": 0.14912450313568115, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0104, + "step": 16550 + }, + { + "epoch": 9.266927811975378, + "grad_norm": 0.2108585089445114, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0068, + "step": 16560 + }, + { + "epoch": 9.272523782876329, + "grad_norm": 0.0906781554222107, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0044, + "step": 16570 + }, + { + "epoch": 9.27811975377728, + "grad_norm": 0.13947215676307678, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0057, + "step": 16580 + }, + { + "epoch": 9.283715724678231, + "grad_norm": 0.11844757199287415, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0044, + "step": 16590 + }, + { + "epoch": 9.289311695579183, + "grad_norm": 0.13809189200401306, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0064, + "step": 16600 + }, + { + "epoch": 9.294907666480134, + "grad_norm": 0.33429670333862305, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0033, + "step": 16610 + }, + { + "epoch": 9.300503637381086, + "grad_norm": 0.15949353575706482, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0142, + "step": 16620 + }, + { + "epoch": 9.306099608282038, + "grad_norm": 0.30085355043411255, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0064, + "step": 16630 + }, + { + "epoch": 9.311695579182988, + "grad_norm": 0.09114662557840347, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0062, + "step": 16640 + }, + { + "epoch": 9.31729155008394, + "grad_norm": 0.3248306214809418, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0063, + "step": 16650 + }, + { + "epoch": 9.32288752098489, + "grad_norm": 0.15951383113861084, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0069, + "step": 16660 + }, + { + "epoch": 9.328483491885843, + "grad_norm": 0.1393985003232956, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0046, + "step": 16670 + }, + { + "epoch": 9.334079462786793, + "grad_norm": 0.14699183404445648, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.0047, + "step": 16680 + }, + { + "epoch": 9.339675433687745, + "grad_norm": 0.14034835994243622, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0068, + "step": 16690 + }, + { + "epoch": 9.345271404588695, + "grad_norm": 0.202976793050766, + "learning_rate": 7.260364370723044e-06, + "loss": 0.007, + "step": 16700 + }, + { + "epoch": 9.350867375489647, + "grad_norm": 0.1574084311723709, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0037, + "step": 16710 + }, + { + "epoch": 9.3564633463906, + "grad_norm": 0.23007866740226746, + "learning_rate": 7.174788171373731e-06, + "loss": 0.006, + "step": 16720 + }, + { + "epoch": 9.36205931729155, + "grad_norm": 0.06488067656755447, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0038, + "step": 16730 + }, + { + "epoch": 9.367655288192502, + "grad_norm": 0.08520302921533585, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0035, + "step": 16740 + }, + { + "epoch": 9.373251259093452, + "grad_norm": 0.1132565289735794, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0033, + "step": 16750 + }, + { + "epoch": 9.378847229994404, + "grad_norm": 0.29900556802749634, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0083, + "step": 16760 + }, + { + "epoch": 9.384443200895355, + "grad_norm": 0.21089625358581543, + "learning_rate": 6.962898649802823e-06, + "loss": 0.004, + "step": 16770 + }, + { + "epoch": 9.390039171796307, + "grad_norm": 0.1411179006099701, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0051, + "step": 16780 + }, + { + "epoch": 9.395635142697259, + "grad_norm": 0.20569784939289093, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0057, + "step": 16790 + }, + { + "epoch": 9.40123111359821, + "grad_norm": 0.13673344254493713, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0029, + "step": 16800 + }, + { + "epoch": 9.406827084499161, + "grad_norm": 0.07221835851669312, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 9.412423055400112, + "grad_norm": 0.15173490345478058, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0055, + "step": 16820 + }, + { + "epoch": 9.418019026301064, + "grad_norm": 0.12818996608257294, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 9.423614997202014, + "grad_norm": 0.2607164978981018, + "learning_rate": 6.671198293627479e-06, + "loss": 0.0062, + "step": 16840 + }, + { + "epoch": 9.429210968102966, + "grad_norm": 0.1782405823469162, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0038, + "step": 16850 + }, + { + "epoch": 9.434806939003916, + "grad_norm": 0.1047229990363121, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0038, + "step": 16860 + }, + { + "epoch": 9.440402909904869, + "grad_norm": 0.21528460085391998, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0044, + "step": 16870 + }, + { + "epoch": 9.44599888080582, + "grad_norm": 0.033052559942007065, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0043, + "step": 16880 + }, + { + "epoch": 9.451594851706771, + "grad_norm": 0.08729052543640137, + "learning_rate": 6.466392388350695e-06, + "loss": 0.0067, + "step": 16890 + }, + { + "epoch": 9.457190822607723, + "grad_norm": 0.1754913330078125, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0038, + "step": 16900 + }, + { + "epoch": 9.462786793508673, + "grad_norm": 0.13821344077587128, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 9.468382764409625, + "grad_norm": 0.1275906264781952, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0036, + "step": 16920 + }, + { + "epoch": 9.473978735310576, + "grad_norm": 0.14954271912574768, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0034, + "step": 16930 + }, + { + "epoch": 9.479574706211528, + "grad_norm": 0.12982557713985443, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0043, + "step": 16940 + }, + { + "epoch": 9.485170677112478, + "grad_norm": 0.06912703812122345, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0055, + "step": 16950 + }, + { + "epoch": 9.49076664801343, + "grad_norm": 0.19244985282421112, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0043, + "step": 16960 + }, + { + "epoch": 9.496362618914382, + "grad_norm": 0.08739597350358963, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 9.501958589815333, + "grad_norm": 0.11885930597782135, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.0073, + "step": 16980 + }, + { + "epoch": 9.507554560716285, + "grad_norm": 0.07571222633123398, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.0032, + "step": 16990 + }, + { + "epoch": 9.513150531617235, + "grad_norm": 0.07605729252099991, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0036, + "step": 17000 + }, + { + "epoch": 9.518746502518187, + "grad_norm": 0.20224310457706451, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0055, + "step": 17010 + }, + { + "epoch": 9.524342473419138, + "grad_norm": 0.09693833440542221, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0048, + "step": 17020 + }, + { + "epoch": 9.52993844432009, + "grad_norm": 0.13180632889270782, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 9.53553441522104, + "grad_norm": 0.18198780715465546, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 9.541130386121992, + "grad_norm": 0.08620735257863998, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 9.546726357022944, + "grad_norm": 0.18070462346076965, + "learning_rate": 5.79239090328883e-06, + "loss": 0.005, + "step": 17060 + }, + { + "epoch": 9.552322327923894, + "grad_norm": 0.13954901695251465, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 9.557918298824847, + "grad_norm": 0.08338068425655365, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 9.563514269725797, + "grad_norm": 0.1206720620393753, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0071, + "step": 17090 + }, + { + "epoch": 9.569110240626749, + "grad_norm": 0.1978680044412613, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0048, + "step": 17100 + }, + { + "epoch": 9.5747062115277, + "grad_norm": 0.2190864086151123, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0043, + "step": 17110 + }, + { + "epoch": 9.580302182428651, + "grad_norm": 0.0734127014875412, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0049, + "step": 17120 + }, + { + "epoch": 9.585898153329603, + "grad_norm": 0.14367960393428802, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0035, + "step": 17130 + }, + { + "epoch": 9.591494124230554, + "grad_norm": 0.26178881525993347, + "learning_rate": 5.48726765236629e-06, + "loss": 0.005, + "step": 17140 + }, + { + "epoch": 9.597090095131506, + "grad_norm": 0.10900067538022995, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0065, + "step": 17150 + }, + { + "epoch": 9.602686066032456, + "grad_norm": 0.16984951496124268, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0051, + "step": 17160 + }, + { + "epoch": 9.608282036933408, + "grad_norm": 0.0894961804151535, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0038, + "step": 17170 + }, + { + "epoch": 9.613878007834359, + "grad_norm": 0.25771039724349976, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0073, + "step": 17180 + }, + { + "epoch": 9.61947397873531, + "grad_norm": 0.14566998183727264, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0046, + "step": 17190 + }, + { + "epoch": 9.625069949636263, + "grad_norm": 0.17133091390132904, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0073, + "step": 17200 + }, + { + "epoch": 9.630665920537213, + "grad_norm": 0.16593864560127258, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0053, + "step": 17210 + }, + { + "epoch": 9.636261891438165, + "grad_norm": 0.11243371665477753, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 9.641857862339116, + "grad_norm": 0.15767988562583923, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0072, + "step": 17230 + }, + { + "epoch": 9.647453833240068, + "grad_norm": 0.2645623981952667, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0052, + "step": 17240 + }, + { + "epoch": 9.653049804141018, + "grad_norm": 0.08610297739505768, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0056, + "step": 17250 + }, + { + "epoch": 9.65864577504197, + "grad_norm": 0.08036172389984131, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0036, + "step": 17260 + }, + { + "epoch": 9.66424174594292, + "grad_norm": 0.18519535660743713, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0057, + "step": 17270 + }, + { + "epoch": 9.669837716843872, + "grad_norm": 0.19542230665683746, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0036, + "step": 17280 + }, + { + "epoch": 9.675433687744825, + "grad_norm": 0.21911007165908813, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0039, + "step": 17290 + }, + { + "epoch": 9.681029658645775, + "grad_norm": 0.14355053007602692, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0052, + "step": 17300 + }, + { + "epoch": 9.686625629546727, + "grad_norm": 0.34103378653526306, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0063, + "step": 17310 + }, + { + "epoch": 9.692221600447677, + "grad_norm": 0.18420292437076569, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0046, + "step": 17320 + }, + { + "epoch": 9.69781757134863, + "grad_norm": 0.11074794083833694, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0056, + "step": 17330 + }, + { + "epoch": 9.70341354224958, + "grad_norm": 0.1706562340259552, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0038, + "step": 17340 + }, + { + "epoch": 9.709009513150532, + "grad_norm": 0.16499456763267517, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0046, + "step": 17350 + }, + { + "epoch": 9.714605484051482, + "grad_norm": 0.08222458511590958, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0062, + "step": 17360 + }, + { + "epoch": 9.720201454952434, + "grad_norm": 0.13518883287906647, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0086, + "step": 17370 + }, + { + "epoch": 9.725797425853386, + "grad_norm": 0.16546756029129028, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0051, + "step": 17380 + }, + { + "epoch": 9.731393396754337, + "grad_norm": 0.31760314106941223, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0056, + "step": 17390 + }, + { + "epoch": 9.736989367655289, + "grad_norm": 0.11362655460834503, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0085, + "step": 17400 + }, + { + "epoch": 9.742585338556239, + "grad_norm": 0.12480427324771881, + "learning_rate": 4.515278581627141e-06, + "loss": 0.003, + "step": 17410 + }, + { + "epoch": 9.748181309457191, + "grad_norm": 0.09458563476800919, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0043, + "step": 17420 + }, + { + "epoch": 9.753777280358142, + "grad_norm": 0.15045048296451569, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0035, + "step": 17430 + }, + { + "epoch": 9.759373251259094, + "grad_norm": 0.10754050314426422, + "learning_rate": 4.412841079266777e-06, + "loss": 0.0059, + "step": 17440 + }, + { + "epoch": 9.764969222160044, + "grad_norm": 0.09626353532075882, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0064, + "step": 17450 + }, + { + "epoch": 9.770565193060996, + "grad_norm": 0.0682365670800209, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0039, + "step": 17460 + }, + { + "epoch": 9.776161163961948, + "grad_norm": 0.0859832614660263, + "learning_rate": 4.311525276827682e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "epoch": 9.781757134862898, + "grad_norm": 0.057302311062812805, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0061, + "step": 17480 + }, + { + "epoch": 9.78735310576385, + "grad_norm": 0.30939188599586487, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0072, + "step": 17490 + }, + { + "epoch": 9.7929490766648, + "grad_norm": 0.06655000895261765, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.006, + "step": 17500 + }, + { + "epoch": 9.798545047565753, + "grad_norm": 0.07795148342847824, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0064, + "step": 17510 + }, + { + "epoch": 9.804141018466703, + "grad_norm": 0.06218419224023819, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 9.809736989367655, + "grad_norm": 0.064509816467762, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0048, + "step": 17530 + }, + { + "epoch": 9.815332960268606, + "grad_norm": 0.2096703052520752, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0049, + "step": 17540 + }, + { + "epoch": 9.820928931169558, + "grad_norm": 0.15621553361415863, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0046, + "step": 17550 + }, + { + "epoch": 9.82652490207051, + "grad_norm": 0.089202381670475, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0078, + "step": 17560 + }, + { + "epoch": 9.83212087297146, + "grad_norm": 0.11227259039878845, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0046, + "step": 17570 + }, + { + "epoch": 9.837716843872412, + "grad_norm": 0.038788773119449615, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0093, + "step": 17580 + }, + { + "epoch": 9.843312814773363, + "grad_norm": 0.1287786364555359, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0041, + "step": 17590 + }, + { + "epoch": 9.848908785674315, + "grad_norm": 0.04712485149502754, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0046, + "step": 17600 + }, + { + "epoch": 9.854504756575265, + "grad_norm": 0.24810890853405, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0112, + "step": 17610 + }, + { + "epoch": 9.860100727476217, + "grad_norm": 0.16745951771736145, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0038, + "step": 17620 + }, + { + "epoch": 9.86569669837717, + "grad_norm": 0.10218873620033264, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0037, + "step": 17630 + }, + { + "epoch": 9.87129266927812, + "grad_norm": 0.19612161815166473, + "learning_rate": 3.75870883930306e-06, + "loss": 0.004, + "step": 17640 + }, + { + "epoch": 9.876888640179072, + "grad_norm": 0.20635591447353363, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0081, + "step": 17650 + }, + { + "epoch": 9.882484611080022, + "grad_norm": 0.154740571975708, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0049, + "step": 17660 + }, + { + "epoch": 9.888080581980974, + "grad_norm": 0.046477749943733215, + "learning_rate": 3.664933406085402e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "epoch": 9.893676552881924, + "grad_norm": 0.20742470026016235, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 9.899272523782876, + "grad_norm": 0.07390665262937546, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 9.904868494683829, + "grad_norm": 0.12964075803756714, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 9.910464465584779, + "grad_norm": 0.05571340024471283, + "learning_rate": 3.541673382905558e-06, + "loss": 0.008, + "step": 17710 + }, + { + "epoch": 9.916060436485731, + "grad_norm": 0.12276771664619446, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0069, + "step": 17720 + }, + { + "epoch": 9.921656407386681, + "grad_norm": 0.09888763725757599, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0036, + "step": 17730 + }, + { + "epoch": 9.927252378287633, + "grad_norm": 0.08338962495326996, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.004, + "step": 17740 + }, + { + "epoch": 9.932848349188584, + "grad_norm": 0.06845631450414658, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0037, + "step": 17750 + }, + { + "epoch": 9.938444320089536, + "grad_norm": 0.072002112865448, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0049, + "step": 17760 + }, + { + "epoch": 9.944040290990486, + "grad_norm": 0.13706427812576294, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "epoch": 9.949636261891438, + "grad_norm": 0.14595244824886322, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0047, + "step": 17780 + }, + { + "epoch": 9.95523223279239, + "grad_norm": 0.07961612939834595, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0035, + "step": 17790 + }, + { + "epoch": 9.96082820369334, + "grad_norm": 0.10814230144023895, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0048, + "step": 17800 + }, + { + "epoch": 9.966424174594293, + "grad_norm": 0.11842755228281021, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0095, + "step": 17810 + }, + { + "epoch": 9.972020145495243, + "grad_norm": 0.21332372725009918, + "learning_rate": 3.213202944713023e-06, + "loss": 0.003, + "step": 17820 + }, + { + "epoch": 9.977616116396195, + "grad_norm": 0.06386691331863403, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.0036, + "step": 17830 + }, + { + "epoch": 9.983212087297145, + "grad_norm": 0.08316194266080856, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0042, + "step": 17840 + }, + { + "epoch": 9.988808058198098, + "grad_norm": 0.16622905433177948, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0039, + "step": 17850 + }, + { + "epoch": 9.994404029099048, + "grad_norm": 0.11861821264028549, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0048, + "step": 17860 + }, + { + "epoch": 10.0, + "grad_norm": 0.1722375601530075, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0026, + "step": 17870 + }, + { + "epoch": 10.005595970900952, + "grad_norm": 0.06653541326522827, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0048, + "step": 17880 + }, + { + "epoch": 10.011191941801902, + "grad_norm": 0.16646505892276764, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.004, + "step": 17890 + }, + { + "epoch": 10.016787912702855, + "grad_norm": 0.07118295133113861, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0066, + "step": 17900 + }, + { + "epoch": 10.022383883603805, + "grad_norm": 0.15453752875328064, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0052, + "step": 17910 + }, + { + "epoch": 10.027979854504757, + "grad_norm": 0.23914295434951782, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0042, + "step": 17920 + }, + { + "epoch": 10.033575825405707, + "grad_norm": 0.09927842766046524, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0053, + "step": 17930 + }, + { + "epoch": 10.03917179630666, + "grad_norm": 0.039526671171188354, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0029, + "step": 17940 + }, + { + "epoch": 10.04476776720761, + "grad_norm": 0.1683174967765808, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0042, + "step": 17950 + }, + { + "epoch": 10.050363738108562, + "grad_norm": 0.10315953940153122, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0096, + "step": 17960 + }, + { + "epoch": 10.055959709009514, + "grad_norm": 0.17959141731262207, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0048, + "step": 17970 + }, + { + "epoch": 10.061555679910464, + "grad_norm": 0.18458683788776398, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 10.067151650811416, + "grad_norm": 0.19159017503261566, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0054, + "step": 17990 + }, + { + "epoch": 10.072747621712367, + "grad_norm": 0.08318327367305756, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0053, + "step": 18000 + }, + { + "epoch": 10.078343592613319, + "grad_norm": 0.07472005486488342, + "learning_rate": 2.682358534298285e-06, + "loss": 0.006, + "step": 18010 + }, + { + "epoch": 10.083939563514269, + "grad_norm": 0.09040942043066025, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.004, + "step": 18020 + }, + { + "epoch": 10.089535534415221, + "grad_norm": 0.037220001220703125, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0038, + "step": 18030 + }, + { + "epoch": 10.095131505316173, + "grad_norm": 0.11240635067224503, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0031, + "step": 18040 + }, + { + "epoch": 10.100727476217124, + "grad_norm": 0.12259605526924133, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0049, + "step": 18050 + }, + { + "epoch": 10.106323447118076, + "grad_norm": 0.16758129000663757, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0066, + "step": 18060 + }, + { + "epoch": 10.111919418019026, + "grad_norm": 0.10704974085092545, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.0021, + "step": 18070 + }, + { + "epoch": 10.117515388919978, + "grad_norm": 0.19684171676635742, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0035, + "step": 18080 + }, + { + "epoch": 10.123111359820928, + "grad_norm": 0.25069093704223633, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0038, + "step": 18090 + }, + { + "epoch": 10.12870733072188, + "grad_norm": 0.15888355672359467, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0055, + "step": 18100 + }, + { + "epoch": 10.13430330162283, + "grad_norm": 0.1355513483285904, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0032, + "step": 18110 + }, + { + "epoch": 10.139899272523783, + "grad_norm": 0.09521888941526413, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0072, + "step": 18120 + }, + { + "epoch": 10.145495243424735, + "grad_norm": 0.18774038553237915, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0028, + "step": 18130 + }, + { + "epoch": 10.151091214325685, + "grad_norm": 0.06293562054634094, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0032, + "step": 18140 + }, + { + "epoch": 10.156687185226637, + "grad_norm": 0.25169095396995544, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0092, + "step": 18150 + }, + { + "epoch": 10.162283156127588, + "grad_norm": 0.08376752585172653, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0021, + "step": 18160 + }, + { + "epoch": 10.16787912702854, + "grad_norm": 0.06758670508861542, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 10.17347509792949, + "grad_norm": 0.06193256378173828, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0036, + "step": 18180 + }, + { + "epoch": 10.179071068830442, + "grad_norm": 0.21087805926799774, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0043, + "step": 18190 + }, + { + "epoch": 10.184667039731393, + "grad_norm": 0.09581877291202545, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0048, + "step": 18200 + }, + { + "epoch": 10.190263010632345, + "grad_norm": 0.23187117278575897, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0068, + "step": 18210 + }, + { + "epoch": 10.195858981533297, + "grad_norm": 0.1904383897781372, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0036, + "step": 18220 + }, + { + "epoch": 10.201454952434247, + "grad_norm": 0.04588289558887482, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0075, + "step": 18230 + }, + { + "epoch": 10.2070509233352, + "grad_norm": 0.05446457862854004, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 10.21264689423615, + "grad_norm": 0.12907229363918304, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 10.218242865137102, + "grad_norm": 0.0724627822637558, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0024, + "step": 18260 + }, + { + "epoch": 10.223838836038052, + "grad_norm": 0.11797565221786499, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0032, + "step": 18270 + }, + { + "epoch": 10.229434806939004, + "grad_norm": 0.1239556148648262, + "learning_rate": 2.008446509461498e-06, + "loss": 0.0038, + "step": 18280 + }, + { + "epoch": 10.235030777839956, + "grad_norm": 0.05614084377884865, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0041, + "step": 18290 + }, + { + "epoch": 10.240626748740906, + "grad_norm": 0.2968387007713318, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0092, + "step": 18300 + }, + { + "epoch": 10.246222719641858, + "grad_norm": 0.11529407650232315, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0059, + "step": 18310 + }, + { + "epoch": 10.251818690542809, + "grad_norm": 0.24037353694438934, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0045, + "step": 18320 + }, + { + "epoch": 10.257414661443761, + "grad_norm": 0.20346900820732117, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 10.263010632344711, + "grad_norm": 0.27883380651474, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0049, + "step": 18340 + }, + { + "epoch": 10.268606603245663, + "grad_norm": 0.12232355028390884, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0077, + "step": 18350 + }, + { + "epoch": 10.274202574146614, + "grad_norm": 0.09397400170564651, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0039, + "step": 18360 + }, + { + "epoch": 10.279798545047566, + "grad_norm": 0.13843244314193726, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.0059, + "step": 18370 + }, + { + "epoch": 10.285394515948518, + "grad_norm": 0.04927824065089226, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.003, + "step": 18380 + }, + { + "epoch": 10.290990486849468, + "grad_norm": 0.2848436236381531, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.004, + "step": 18390 + }, + { + "epoch": 10.29658645775042, + "grad_norm": 0.10808296501636505, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0072, + "step": 18400 + }, + { + "epoch": 10.30218242865137, + "grad_norm": 0.08363109827041626, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0029, + "step": 18410 + }, + { + "epoch": 10.307778399552323, + "grad_norm": 0.07970738410949707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0058, + "step": 18420 + }, + { + "epoch": 10.313374370453273, + "grad_norm": 0.06155739724636078, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0042, + "step": 18430 + }, + { + "epoch": 10.318970341354225, + "grad_norm": 0.15518154203891754, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.006, + "step": 18440 + }, + { + "epoch": 10.324566312255175, + "grad_norm": 0.06478218734264374, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0083, + "step": 18450 + }, + { + "epoch": 10.330162283156128, + "grad_norm": 0.11871203780174255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 10.33575825405708, + "grad_norm": 0.13164940476417542, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 10.34135422495803, + "grad_norm": 0.08165992051362991, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0065, + "step": 18480 + }, + { + "epoch": 10.346950195858982, + "grad_norm": 0.06479761004447937, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0044, + "step": 18490 + }, + { + "epoch": 10.352546166759932, + "grad_norm": 0.24080127477645874, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0085, + "step": 18500 + }, + { + "epoch": 10.358142137660884, + "grad_norm": 0.1411421000957489, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0045, + "step": 18510 + }, + { + "epoch": 10.363738108561835, + "grad_norm": 0.058580052107572556, + "learning_rate": 1.4896615227983468e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 10.369334079462787, + "grad_norm": 0.1638147383928299, + "learning_rate": 1.4696978433290653e-06, + "loss": 0.0054, + "step": 18530 + }, + { + "epoch": 10.374930050363739, + "grad_norm": 0.05566524341702461, + "learning_rate": 1.4498668438527597e-06, + "loss": 0.004, + "step": 18540 + }, + { + "epoch": 10.38052602126469, + "grad_norm": 0.07601140439510345, + "learning_rate": 1.4301685785866214e-06, + "loss": 0.0034, + "step": 18550 + }, + { + "epoch": 10.386121992165641, + "grad_norm": 0.10449633747339249, + "learning_rate": 1.4106031013849496e-06, + "loss": 0.0041, + "step": 18560 + }, + { + "epoch": 10.391717963066592, + "grad_norm": 0.15937356650829315, + "learning_rate": 1.3911704657390113e-06, + "loss": 0.0039, + "step": 18570 + }, + { + "epoch": 10.397313933967544, + "grad_norm": 0.059475306421518326, + "learning_rate": 1.3718707247769135e-06, + "loss": 0.006, + "step": 18580 + }, + { + "epoch": 10.402909904868494, + "grad_norm": 0.24354378879070282, + "learning_rate": 1.3527039312633827e-06, + "loss": 0.0042, + "step": 18590 + }, + { + "epoch": 10.408505875769446, + "grad_norm": 0.20878778398036957, + "learning_rate": 1.333670137599713e-06, + "loss": 0.0107, + "step": 18600 + }, + { + "epoch": 10.414101846670397, + "grad_norm": 0.1909496784210205, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0034, + "step": 18610 + }, + { + "epoch": 10.419697817571349, + "grad_norm": 0.13632823526859283, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0066, + "step": 18620 + }, + { + "epoch": 10.4252937884723, + "grad_norm": 0.10793755203485489, + "learning_rate": 1.2773672742655784e-06, + "loss": 0.0037, + "step": 18630 + }, + { + "epoch": 10.430889759373251, + "grad_norm": 0.10346037149429321, + "learning_rate": 1.2588659967397e-06, + "loss": 0.0044, + "step": 18640 + }, + { + "epoch": 10.436485730274203, + "grad_norm": 0.08834080398082733, + "learning_rate": 1.2404979756130142e-06, + "loss": 0.0037, + "step": 18650 + }, + { + "epoch": 10.442081701175153, + "grad_norm": 0.09045784175395966, + "learning_rate": 1.222263261102985e-06, + "loss": 0.0052, + "step": 18660 + }, + { + "epoch": 10.447677672076106, + "grad_norm": 0.07731129229068756, + "learning_rate": 1.2041619030626284e-06, + "loss": 0.0071, + "step": 18670 + }, + { + "epoch": 10.453273642977056, + "grad_norm": 0.08769071102142334, + "learning_rate": 1.1861939509803687e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 10.458869613878008, + "grad_norm": 0.15766629576683044, + "learning_rate": 1.1683594539798893e-06, + "loss": 0.0063, + "step": 18690 + }, + { + "epoch": 10.46446558477896, + "grad_norm": 0.11048921942710876, + "learning_rate": 1.1506584608200367e-06, + "loss": 0.0033, + "step": 18700 + }, + { + "epoch": 10.47006155567991, + "grad_norm": 0.25674813985824585, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0047, + "step": 18710 + }, + { + "epoch": 10.475657526580862, + "grad_norm": 0.09696432203054428, + "learning_rate": 1.1156571792324211e-06, + "loss": 0.0038, + "step": 18720 + }, + { + "epoch": 10.481253497481813, + "grad_norm": 0.17716100811958313, + "learning_rate": 1.0983569864968346e-06, + "loss": 0.0085, + "step": 18730 + }, + { + "epoch": 10.486849468382765, + "grad_norm": 0.18763263523578644, + "learning_rate": 1.0811904889859336e-06, + "loss": 0.009, + "step": 18740 + }, + { + "epoch": 10.492445439283715, + "grad_norm": 0.047968145459890366, + "learning_rate": 1.064157733632276e-06, + "loss": 0.0051, + "step": 18750 + }, + { + "epoch": 10.498041410184667, + "grad_norm": 0.1565999537706375, + "learning_rate": 1.0472587670027678e-06, + "loss": 0.0062, + "step": 18760 + }, + { + "epoch": 10.503637381085618, + "grad_norm": 0.06519567221403122, + "learning_rate": 1.030493635298535e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 10.50923335198657, + "grad_norm": 0.10364692658185959, + "learning_rate": 1.0138623843548078e-06, + "loss": 0.0051, + "step": 18780 + }, + { + "epoch": 10.514829322887522, + "grad_norm": 0.036633651703596115, + "learning_rate": 9.97365059640787e-07, + "loss": 0.0062, + "step": 18790 + }, + { + "epoch": 10.520425293788472, + "grad_norm": 0.2015930861234665, + "learning_rate": 9.810017062595322e-07, + "loss": 0.0037, + "step": 18800 + }, + { + "epoch": 10.526021264689424, + "grad_norm": 0.1180974468588829, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0039, + "step": 18810 + }, + { + "epoch": 10.531617235590375, + "grad_norm": 0.07416771352291107, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0041, + "step": 18820 + }, + { + "epoch": 10.537213206491327, + "grad_norm": 0.05668334290385246, + "learning_rate": 9.327159196481138e-07, + "loss": 0.0059, + "step": 18830 + }, + { + "epoch": 10.542809177392277, + "grad_norm": 0.07584750652313232, + "learning_rate": 9.168888953011989e-07, + "loss": 0.0054, + "step": 18840 + }, + { + "epoch": 10.548405148293229, + "grad_norm": 0.06703902035951614, + "learning_rate": 9.011960623058202e-07, + "loss": 0.0039, + "step": 18850 + }, + { + "epoch": 10.55400111919418, + "grad_norm": 0.06538796424865723, + "learning_rate": 8.856374635655695e-07, + "loss": 0.0035, + "step": 18860 + }, + { + "epoch": 10.559597090095131, + "grad_norm": 0.09234767407178879, + "learning_rate": 8.702131416170656e-07, + "loss": 0.0047, + "step": 18870 + }, + { + "epoch": 10.565193060996084, + "grad_norm": 0.09068552404642105, + "learning_rate": 8.549231386298151e-07, + "loss": 0.0032, + "step": 18880 + }, + { + "epoch": 10.570789031897034, + "grad_norm": 0.2574044466018677, + "learning_rate": 8.397674964061075e-07, + "loss": 0.0123, + "step": 18890 + }, + { + "epoch": 10.576385002797986, + "grad_norm": 0.1742398738861084, + "learning_rate": 8.247462563808817e-07, + "loss": 0.005, + "step": 18900 + }, + { + "epoch": 10.581980973698936, + "grad_norm": 0.19498533010482788, + "learning_rate": 8.098594596216424e-07, + "loss": 0.0051, + "step": 18910 + }, + { + "epoch": 10.587576944599888, + "grad_norm": 0.1093849390745163, + "learning_rate": 7.951071468283167e-07, + "loss": 0.0062, + "step": 18920 + }, + { + "epoch": 10.593172915500839, + "grad_norm": 0.05242215842008591, + "learning_rate": 7.804893583331696e-07, + "loss": 0.0049, + "step": 18930 + }, + { + "epoch": 10.59876888640179, + "grad_norm": 0.06830724328756332, + "learning_rate": 7.66006134100672e-07, + "loss": 0.0031, + "step": 18940 + }, + { + "epoch": 10.604364857302741, + "grad_norm": 0.08541436493396759, + "learning_rate": 7.516575137274162e-07, + "loss": 0.0044, + "step": 18950 + }, + { + "epoch": 10.609960828203693, + "grad_norm": 0.042029768228530884, + "learning_rate": 7.374435364419674e-07, + "loss": 0.0043, + "step": 18960 + }, + { + "epoch": 10.615556799104645, + "grad_norm": 0.12100391089916229, + "learning_rate": 7.233642411048014e-07, + "loss": 0.0032, + "step": 18970 + }, + { + "epoch": 10.621152770005596, + "grad_norm": 0.04842936620116234, + "learning_rate": 7.094196662081831e-07, + "loss": 0.0052, + "step": 18980 + }, + { + "epoch": 10.626748740906548, + "grad_norm": 0.13397961854934692, + "learning_rate": 6.956098498760389e-07, + "loss": 0.0056, + "step": 18990 + }, + { + "epoch": 10.632344711807498, + "grad_norm": 0.19486455619335175, + "learning_rate": 6.819348298638839e-07, + "loss": 0.0029, + "step": 19000 + }, + { + "epoch": 10.63794068270845, + "grad_norm": 0.1525876224040985, + "learning_rate": 6.683946435586952e-07, + "loss": 0.0142, + "step": 19010 + }, + { + "epoch": 10.6435366536094, + "grad_norm": 0.09059377759695053, + "learning_rate": 6.549893279788277e-07, + "loss": 0.0057, + "step": 19020 + }, + { + "epoch": 10.649132624510353, + "grad_norm": 0.08628048002719879, + "learning_rate": 6.417189197739093e-07, + "loss": 0.0059, + "step": 19030 + }, + { + "epoch": 10.654728595411305, + "grad_norm": 0.34853503108024597, + "learning_rate": 6.285834552247128e-07, + "loss": 0.0041, + "step": 19040 + }, + { + "epoch": 10.660324566312255, + "grad_norm": 0.1580825001001358, + "learning_rate": 6.15582970243117e-07, + "loss": 0.0059, + "step": 19050 + }, + { + "epoch": 10.665920537213207, + "grad_norm": 0.2064519226551056, + "learning_rate": 6.027175003719354e-07, + "loss": 0.0065, + "step": 19060 + }, + { + "epoch": 10.671516508114157, + "grad_norm": 0.1656566709280014, + "learning_rate": 5.899870807848762e-07, + "loss": 0.0045, + "step": 19070 + }, + { + "epoch": 10.67711247901511, + "grad_norm": 0.06346923857927322, + "learning_rate": 5.773917462864264e-07, + "loss": 0.0108, + "step": 19080 + }, + { + "epoch": 10.68270844991606, + "grad_norm": 0.0746588408946991, + "learning_rate": 5.64931531311741e-07, + "loss": 0.0038, + "step": 19090 + }, + { + "epoch": 10.688304420817012, + "grad_norm": 0.10566951334476471, + "learning_rate": 5.526064699265753e-07, + "loss": 0.0084, + "step": 19100 + }, + { + "epoch": 10.693900391717962, + "grad_norm": 0.061587151139974594, + "learning_rate": 5.404165958271811e-07, + "loss": 0.0042, + "step": 19110 + }, + { + "epoch": 10.699496362618914, + "grad_norm": 0.27593472599983215, + "learning_rate": 5.283619423401998e-07, + "loss": 0.005, + "step": 19120 + }, + { + "epoch": 10.705092333519866, + "grad_norm": 0.37827596068382263, + "learning_rate": 5.164425424226016e-07, + "loss": 0.0068, + "step": 19130 + }, + { + "epoch": 10.710688304420817, + "grad_norm": 0.2789309322834015, + "learning_rate": 5.046584286615697e-07, + "loss": 0.0054, + "step": 19140 + }, + { + "epoch": 10.716284275321769, + "grad_norm": 0.08417310565710068, + "learning_rate": 4.930096332744105e-07, + "loss": 0.0043, + "step": 19150 + }, + { + "epoch": 10.72188024622272, + "grad_norm": 0.13277283310890198, + "learning_rate": 4.814961881085045e-07, + "loss": 0.007, + "step": 19160 + }, + { + "epoch": 10.727476217123671, + "grad_norm": 0.029057292267680168, + "learning_rate": 4.701181246411501e-07, + "loss": 0.0077, + "step": 19170 + }, + { + "epoch": 10.733072188024622, + "grad_norm": 0.07132174074649811, + "learning_rate": 4.5887547397955864e-07, + "loss": 0.0044, + "step": 19180 + }, + { + "epoch": 10.738668158925574, + "grad_norm": 0.05213991925120354, + "learning_rate": 4.4776826686069305e-07, + "loss": 0.0022, + "step": 19190 + }, + { + "epoch": 10.744264129826526, + "grad_norm": 0.092039555311203, + "learning_rate": 4.367965336512403e-07, + "loss": 0.0032, + "step": 19200 + }, + { + "epoch": 10.749860100727476, + "grad_norm": 0.17352578043937683, + "learning_rate": 4.259603043475002e-07, + "loss": 0.0064, + "step": 19210 + }, + { + "epoch": 10.755456071628428, + "grad_norm": 0.15915948152542114, + "learning_rate": 4.1525960857530243e-07, + "loss": 0.0075, + "step": 19220 + }, + { + "epoch": 10.761052042529379, + "grad_norm": 0.21297423541545868, + "learning_rate": 4.0469447558995065e-07, + "loss": 0.0057, + "step": 19230 + }, + { + "epoch": 10.76664801343033, + "grad_norm": 0.17462663352489471, + "learning_rate": 3.9426493427611177e-07, + "loss": 0.0056, + "step": 19240 + }, + { + "epoch": 10.772243984331281, + "grad_norm": 0.10657753050327301, + "learning_rate": 3.839710131477492e-07, + "loss": 0.0089, + "step": 19250 + }, + { + "epoch": 10.777839955232233, + "grad_norm": 0.07254552841186523, + "learning_rate": 3.738127403480507e-07, + "loss": 0.003, + "step": 19260 + }, + { + "epoch": 10.783435926133183, + "grad_norm": 0.27843359112739563, + "learning_rate": 3.637901436493507e-07, + "loss": 0.0067, + "step": 19270 + }, + { + "epoch": 10.789031897034135, + "grad_norm": 0.17431190609931946, + "learning_rate": 3.5390325045304706e-07, + "loss": 0.0042, + "step": 19280 + }, + { + "epoch": 10.794627867935088, + "grad_norm": 0.11761761456727982, + "learning_rate": 3.441520877895288e-07, + "loss": 0.0036, + "step": 19290 + }, + { + "epoch": 10.800223838836038, + "grad_norm": 0.1055087074637413, + "learning_rate": 3.3453668231809286e-07, + "loss": 0.0049, + "step": 19300 + }, + { + "epoch": 10.80581980973699, + "grad_norm": 0.05716053023934364, + "learning_rate": 3.250570603268943e-07, + "loss": 0.0057, + "step": 19310 + }, + { + "epoch": 10.81141578063794, + "grad_norm": 0.06227661669254303, + "learning_rate": 3.157132477328628e-07, + "loss": 0.0047, + "step": 19320 + }, + { + "epoch": 10.817011751538892, + "grad_norm": 0.07587496936321259, + "learning_rate": 3.0650527008162513e-07, + "loss": 0.0058, + "step": 19330 + }, + { + "epoch": 10.822607722439843, + "grad_norm": 0.12384708225727081, + "learning_rate": 2.9743315254743833e-07, + "loss": 0.0044, + "step": 19340 + }, + { + "epoch": 10.828203693340795, + "grad_norm": 0.130027636885643, + "learning_rate": 2.8849691993311777e-07, + "loss": 0.0048, + "step": 19350 + }, + { + "epoch": 10.833799664241745, + "grad_norm": 0.03498604893684387, + "learning_rate": 2.796965966699927e-07, + "loss": 0.0076, + "step": 19360 + }, + { + "epoch": 10.839395635142697, + "grad_norm": 0.06795532256364822, + "learning_rate": 2.7103220681780615e-07, + "loss": 0.0046, + "step": 19370 + }, + { + "epoch": 10.84499160604365, + "grad_norm": 0.15649089217185974, + "learning_rate": 2.625037740646763e-07, + "loss": 0.0041, + "step": 19380 + }, + { + "epoch": 10.8505875769446, + "grad_norm": 0.19872230291366577, + "learning_rate": 2.5411132172700194e-07, + "loss": 0.0045, + "step": 19390 + }, + { + "epoch": 10.856183547845552, + "grad_norm": 0.1986837238073349, + "learning_rate": 2.458548727494292e-07, + "loss": 0.0034, + "step": 19400 + }, + { + "epoch": 10.861779518746502, + "grad_norm": 0.34645870327949524, + "learning_rate": 2.3773444970477955e-07, + "loss": 0.0059, + "step": 19410 + }, + { + "epoch": 10.867375489647454, + "grad_norm": 0.043271441012620926, + "learning_rate": 2.2975007479397738e-07, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 10.872971460548404, + "grad_norm": 0.10621374845504761, + "learning_rate": 2.219017698460002e-07, + "loss": 0.0107, + "step": 19430 + }, + { + "epoch": 10.878567431449357, + "grad_norm": 0.038412097841501236, + "learning_rate": 2.1418955631781202e-07, + "loss": 0.0025, + "step": 19440 + }, + { + "epoch": 10.884163402350307, + "grad_norm": 0.14375977218151093, + "learning_rate": 2.0661345529430775e-07, + "loss": 0.0063, + "step": 19450 + }, + { + "epoch": 10.889759373251259, + "grad_norm": 0.28644490242004395, + "learning_rate": 1.9917348748826335e-07, + "loss": 0.0037, + "step": 19460 + }, + { + "epoch": 10.895355344152211, + "grad_norm": 0.19371145963668823, + "learning_rate": 1.918696732402636e-07, + "loss": 0.0071, + "step": 19470 + }, + { + "epoch": 10.900951315053161, + "grad_norm": 0.11907006055116653, + "learning_rate": 1.847020325186577e-07, + "loss": 0.0049, + "step": 19480 + }, + { + "epoch": 10.906547285954113, + "grad_norm": 0.10020023584365845, + "learning_rate": 1.776705849195037e-07, + "loss": 0.0036, + "step": 19490 + }, + { + "epoch": 10.912143256855064, + "grad_norm": 0.12778791785240173, + "learning_rate": 1.7077534966650766e-07, + "loss": 0.0057, + "step": 19500 + }, + { + "epoch": 10.917739227756016, + "grad_norm": 0.06359223276376724, + "learning_rate": 1.6401634561098444e-07, + "loss": 0.0036, + "step": 19510 + }, + { + "epoch": 10.923335198656966, + "grad_norm": 0.07983513921499252, + "learning_rate": 1.5739359123178587e-07, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 10.928931169557918, + "grad_norm": 0.12060696631669998, + "learning_rate": 1.5090710463527836e-07, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 10.93452714045887, + "grad_norm": 0.10252276062965393, + "learning_rate": 1.4455690355525964e-07, + "loss": 0.0052, + "step": 19540 + }, + { + "epoch": 10.94012311135982, + "grad_norm": 0.10586907714605331, + "learning_rate": 1.383430053529422e-07, + "loss": 0.0025, + "step": 19550 + }, + { + "epoch": 10.945719082260773, + "grad_norm": 0.05571618303656578, + "learning_rate": 1.3226542701689215e-07, + "loss": 0.0045, + "step": 19560 + }, + { + "epoch": 10.951315053161723, + "grad_norm": 0.07698628306388855, + "learning_rate": 1.2632418516296262e-07, + "loss": 0.0039, + "step": 19570 + }, + { + "epoch": 10.956911024062675, + "grad_norm": 0.3049318790435791, + "learning_rate": 1.2051929603428825e-07, + "loss": 0.0036, + "step": 19580 + }, + { + "epoch": 10.962506994963626, + "grad_norm": 0.04247491434216499, + "learning_rate": 1.1485077550122402e-07, + "loss": 0.0086, + "step": 19590 + }, + { + "epoch": 10.968102965864578, + "grad_norm": 0.13998843729496002, + "learning_rate": 1.0931863906127327e-07, + "loss": 0.0032, + "step": 19600 + }, + { + "epoch": 10.973698936765528, + "grad_norm": 0.18532228469848633, + "learning_rate": 1.0392290183909304e-07, + "loss": 0.0053, + "step": 19610 + }, + { + "epoch": 10.97929490766648, + "grad_norm": 0.24849370121955872, + "learning_rate": 9.866357858642205e-08, + "loss": 0.0031, + "step": 19620 + }, + { + "epoch": 10.984890878567432, + "grad_norm": 0.04739070311188698, + "learning_rate": 9.354068368204739e-08, + "loss": 0.0055, + "step": 19630 + }, + { + "epoch": 10.990486849468383, + "grad_norm": 0.13325341045856476, + "learning_rate": 8.855423113177664e-08, + "loss": 0.0027, + "step": 19640 + }, + { + "epoch": 10.996082820369335, + "grad_norm": 0.15442515909671783, + "learning_rate": 8.37042345683714e-08, + "loss": 0.009, + "step": 19650 + }, + { + "epoch": 11.001678791270285, + "grad_norm": 0.20657239854335785, + "learning_rate": 7.899070725153613e-08, + "loss": 0.0063, + "step": 19660 + }, + { + "epoch": 11.007274762171237, + "grad_norm": 0.16029535233974457, + "learning_rate": 7.44136620678848e-08, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 11.012870733072187, + "grad_norm": 0.16476546227931976, + "learning_rate": 6.997311153086883e-08, + "loss": 0.0066, + "step": 19680 + }, + { + "epoch": 11.01846670397314, + "grad_norm": 0.12683425843715668, + "learning_rate": 6.566906778079917e-08, + "loss": 0.0052, + "step": 19690 + }, + { + "epoch": 11.024062674874092, + "grad_norm": 0.23135153949260712, + "learning_rate": 6.150154258476315e-08, + "loss": 0.0043, + "step": 19700 + }, + { + "epoch": 11.029658645775042, + "grad_norm": 0.1939716786146164, + "learning_rate": 5.747054733660773e-08, + "loss": 0.0077, + "step": 19710 + }, + { + "epoch": 11.035254616675994, + "grad_norm": 0.11450741440057755, + "learning_rate": 5.3576093056922906e-08, + "loss": 0.0079, + "step": 19720 + }, + { + "epoch": 11.040850587576944, + "grad_norm": 0.06929726153612137, + "learning_rate": 4.981819039300284e-08, + "loss": 0.0039, + "step": 19730 + }, + { + "epoch": 11.046446558477896, + "grad_norm": 0.11268885433673859, + "learning_rate": 4.619684961881254e-08, + "loss": 0.0047, + "step": 19740 + }, + { + "epoch": 11.052042529378847, + "grad_norm": 0.07555661350488663, + "learning_rate": 4.2712080634949024e-08, + "loss": 0.0038, + "step": 19750 + }, + { + "epoch": 11.057638500279799, + "grad_norm": 0.07180225849151611, + "learning_rate": 3.936389296864129e-08, + "loss": 0.0066, + "step": 19760 + }, + { + "epoch": 11.063234471180749, + "grad_norm": 0.2635197937488556, + "learning_rate": 3.615229577371149e-08, + "loss": 0.0047, + "step": 19770 + }, + { + "epoch": 11.068830442081701, + "grad_norm": 0.03527739644050598, + "learning_rate": 3.3077297830541584e-08, + "loss": 0.0047, + "step": 19780 + }, + { + "epoch": 11.074426412982653, + "grad_norm": 0.061606280505657196, + "learning_rate": 3.01389075460512e-08, + "loss": 0.0069, + "step": 19790 + }, + { + "epoch": 11.080022383883604, + "grad_norm": 0.14764872193336487, + "learning_rate": 2.7337132953697554e-08, + "loss": 0.0063, + "step": 19800 + }, + { + "epoch": 11.085618354784556, + "grad_norm": 0.13825170695781708, + "learning_rate": 2.467198171342e-08, + "loss": 0.0047, + "step": 19810 + }, + { + "epoch": 11.091214325685506, + "grad_norm": 0.40132373571395874, + "learning_rate": 2.214346111164556e-08, + "loss": 0.0058, + "step": 19820 + }, + { + "epoch": 11.096810296586458, + "grad_norm": 0.06293044239282608, + "learning_rate": 1.9751578061244504e-08, + "loss": 0.0093, + "step": 19830 + }, + { + "epoch": 11.102406267487408, + "grad_norm": 0.08641501516103745, + "learning_rate": 1.749633910153592e-08, + "loss": 0.0061, + "step": 19840 + }, + { + "epoch": 11.10800223838836, + "grad_norm": 0.06543342024087906, + "learning_rate": 1.5377750398265502e-08, + "loss": 0.0034, + "step": 19850 + }, + { + "epoch": 11.11359820928931, + "grad_norm": 0.0463268905878067, + "learning_rate": 1.3395817743561134e-08, + "loss": 0.0031, + "step": 19860 + }, + { + "epoch": 11.119194180190263, + "grad_norm": 0.18889687955379486, + "learning_rate": 1.1550546555960662e-08, + "loss": 0.0049, + "step": 19870 + }, + { + "epoch": 11.124790151091215, + "grad_norm": 0.33526870608329773, + "learning_rate": 9.841941880361916e-09, + "loss": 0.0068, + "step": 19880 + }, + { + "epoch": 11.130386121992165, + "grad_norm": 0.17259934544563293, + "learning_rate": 8.270008388022721e-09, + "loss": 0.0047, + "step": 19890 + }, + { + "epoch": 11.135982092893117, + "grad_norm": 0.24882031977176666, + "learning_rate": 6.834750376549792e-09, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 11.141578063794068, + "grad_norm": 0.05286456272006035, + "learning_rate": 5.536171769887632e-09, + "loss": 0.0059, + "step": 19910 + }, + { + "epoch": 11.14717403469502, + "grad_norm": 0.08882560580968857, + "learning_rate": 4.3742761183018784e-09, + "loss": 0.0063, + "step": 19920 + }, + { + "epoch": 11.15277000559597, + "grad_norm": 0.09571769833564758, + "learning_rate": 3.349066598362649e-09, + "loss": 0.0034, + "step": 19930 + }, + { + "epoch": 11.158365976496922, + "grad_norm": 0.07795775681734085, + "learning_rate": 2.4605460129556445e-09, + "loss": 0.0029, + "step": 19940 + }, + { + "epoch": 11.163961947397874, + "grad_norm": 0.07696644216775894, + "learning_rate": 1.7087167912710478e-09, + "loss": 0.0083, + "step": 19950 + }, + { + "epoch": 11.169557918298825, + "grad_norm": 0.26498469710350037, + "learning_rate": 1.0935809887702154e-09, + "loss": 0.0033, + "step": 19960 + }, + { + "epoch": 11.175153889199777, + "grad_norm": 0.165630042552948, + "learning_rate": 6.151402872134337e-10, + "loss": 0.007, + "step": 19970 + }, + { + "epoch": 11.180749860100727, + "grad_norm": 0.07009857147932053, + "learning_rate": 2.7339599464326627e-10, + "loss": 0.0037, + "step": 19980 + }, + { + "epoch": 11.18634583100168, + "grad_norm": 0.1754114180803299, + "learning_rate": 6.834904537900144e-11, + "loss": 0.0069, + "step": 19990 + }, + { + "epoch": 11.19194180190263, + "grad_norm": 0.18103741109371185, + "learning_rate": 0.0, + "loss": 0.0044, + "step": 20000 + }, + { + "epoch": 11.19194180190263, + "step": 20000, + "total_flos": 7.091345386565736e+17, + "train_loss": 0.01661205664295703, + "train_runtime": 10761.0808, + "train_samples_per_second": 29.737, + "train_steps_per_second": 1.859 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.091345386565736e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f6cc2bb0fcf76cc08e61f7032d90de2d538ff8d6 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42775082999f39fb52cf62b59cc53276d5a87183201ff8c7c9f8a897ce8f855e +size 5240